Example #1
0
class PlaintextParser(AbstractParser):
    """Parse plain-text files into Text objects, one per
    TEXT_SEPARATOR-delimited section."""

    def __init__(self, filename):
        super(PlaintextParser, self).__init__(filename)
        if not self.wants_this_file():
            return
        self._init_line_count_progress()
        self.nlp_pipeline = NlpPipeline()

    def _wanted_file_endings(self):
        # This parser only handles plain .txt files.
        return (".txt", )

    def parse(self):
        """Yield one Text per separator-delimited section of the file.

        Generator: texts are streamed so the whole file is never held
        in memory at once.
        """
        text = Text()

        with open(self.filename, "r") as file_:
            for line_unenc in file_:
                self._progress += 1
                # BUG FIX: the original did unicode(raw.encode('utf8')),
                # which first ascii-decodes the raw bytes implicitly and
                # raises UnicodeDecodeError on any non-ASCII input.
                # Decode the raw UTF-8 bytes directly instead.
                line = line_unenc.decode('utf8')
                if line.startswith(TEXT_SEPARATOR):
                    # BUG FIX: always skip separator lines. Previously the
                    # `continue` was inside the inner `if`, so a separator
                    # seen while `text` was still empty fell through and
                    # was parsed as ordinary sentence text.
                    if text.sentences:
                        yield text
                        text = Text()
                    continue
                sentences = self.nlp_pipeline.sentence_segmentation(line)
                for sentence in sentences:
                    s = Sentence()
                    s.set_sentence_text(sentence)
                    s.set_tokens(self.nlp_pipeline.parse_text(sentence))
                    text.add_sentence(s)
        # Flush the trailing section when the file doesn't end in a separator.
        if text.sentences:
            yield text

    def progress(self):
        # Progress is tracked in lines read (see _init_line_count_progress).
        return self._line_count_progress()
class PlaintextParser(AbstractParser):
    """Parse .txt files into Text objects; sections are delimited by
    TEXT_SEPARATOR lines."""

    def __init__(self, filename):
        super(PlaintextParser, self).__init__(filename)
        if not self.wants_this_file():
            return
        self._init_line_count_progress()
        self.nlp_pipeline = NlpPipeline()

    def _wanted_file_endings(self):
        # Accept only plain-text files.
        return (".txt",)

    def parse(self):
        """Generator yielding one Text per separator-delimited section."""
        text = Text()

        with open(self.filename, "r") as file_:
            for line_unenc in file_:
                self._progress += 1
                # BUG FIX: decode the raw UTF-8 bytes directly. The original
                # unicode(raw.encode('utf8')) implicitly ascii-decoded the
                # bytes first and blew up on non-ASCII content.
                line = line_unenc.decode('utf8')
                if line.startswith(TEXT_SEPARATOR):
                    # BUG FIX: a separator line is never sentence text, so
                    # skip it unconditionally; the original only skipped it
                    # when the current text already had sentences.
                    if text.sentences:
                        yield text
                        text = Text()
                    continue
                for sentence in self.nlp_pipeline.sentence_segmentation(line):
                    s = Sentence()
                    s.set_sentence_text(sentence)
                    s.set_tokens(self.nlp_pipeline.parse_text(sentence))
                    text.add_sentence(s)
        # Emit the last section if the file lacks a trailing separator.
        if text.sentences:
            yield text

    def progress(self):
        # Line-based progress set up by _init_line_count_progress().
        return self._line_count_progress()
Example #3
0
class InputText(object):
    """Wrap a raw input string together with its gold-standard tokens.

    The text is tokenized exactly once, at construction time; the result
    serves as the gold standard for later comparisons.
    """

    def __init__(self, text):
        # Keep both the raw text and the pipeline used to tokenize it.
        self.text = text
        self.nlp_pipeline = NlpPipeline()
        self.gold_tokens = self.nlp_pipeline.parse_text(self.text)

    def get_gold_tokens(self):
        """Return the token list computed at construction time."""
        return self.gold_tokens
Example #4
0
class XMLParser(AbstractParser):
    """Parse mteval-style XML files: yields one Text per <doc> element
    found under the <srcset> node."""

    def __init__(self, filename):
        super(XMLParser, self).__init__(filename)
        if not self.wants_this_file():
            return
        self.nlp_pipeline = NlpPipeline()
        # Total number of <doc> elements; used as the progress denominator.
        self._linenumber = self._count_docs()
        self._progress = 0

    def _wanted_file_endings(self):
        # Only XML input is handled here.
        return (".xml", )

    def parse(self):
        """Generator yielding one Text (talk) per <doc> element."""
        mteval = xml.etree.ElementTree.parse(self.filename).getroot()
        srcset = mteval.find("srcset")
        for doc in srcset.findall('doc'):
            self._progress += 1
            talk = Text()

            # FIX: renamed the loop variable to `seg` — the original reused
            # `sentence` for both the XML element and the Sentence object,
            # shadowing the element after the first statement.
            for seg in doc.findall("seg"):
                sentence_text = unicode(seg.text)

                sentence = Sentence()
                sentence.set_sentence_text(sentence_text)
                sentence.set_tokens(
                    self.nlp_pipeline.parse_text(sentence_text))
                talk.add_sentence(sentence)

            yield talk

    def progress(self):
        # NOTE(review): progress here counts <doc> elements, yet this
        # delegates to the line-count helper from AbstractParser — confirm
        # it reads self._progress / self._linenumber as set in __init__.
        return self._line_count_progress()

    def _count_docs(self):
        """Return the number of <doc> elements under <srcset>."""
        mteval = xml.etree.ElementTree.parse(self.filename).getroot()
        srcset = mteval.find("srcset")
        # len() of findall() replaces the original manual counting loop.
        return len(srcset.findall('doc'))
 def _initialize_with_text(self, text):
     """Tokenize *text* and store the result in ``self.tokens``."""
     # The pipeline instance is throwaway; only the tokens are kept.
     self.tokens = NlpPipeline().parse_text(text)
 def _initialize_with_text(self, text):
     """Parse *text* with a fresh NlpPipeline, keeping only the tokens."""
     pipeline = NlpPipeline()
     # Store the tokenization; the pipeline itself is not retained.
     self.tokens = pipeline.parse_text(text)