    def parse(self):
        text = Text()

        with open(self.filename, "r") as file_:
            for line_unenc in file_:
                self._progress += 1
                # decode the raw byte line to unicode text (Python 2 file input is bytes)
                line = unicode(line_unenc, 'utf-8')
                # a separator line ends the current text; skip the separator itself
                if line.startswith(TEXT_SEPARATOR):
                    if len(text.sentences) > 0:
                        yield text
                        text = Text()
                    continue
                # segment the line into sentences and tokenize each of them
                sentences = self.nlp_pipeline.sentence_segmentation(line)
                for sentence in sentences:
                    s = Sentence()
                    s.set_sentence_text(sentence)
                    s.set_tokens(self.nlp_pipeline.parse_text(sentence))
                    text.add_sentence(s)
        # emit the final text if the file does not end with a separator
        if len(text.sentences) > 0:
            yield text
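
Since parse() is a generator, texts are produced lazily while the file is read. A minimal consumption sketch; the wrapper class name and constructor below are assumptions for illustration, only the parse() iteration, the yielded Text objects and their sentences attribute come from the snippet above:

# hypothetical driver: PlaintextParser and its constructor are assumed names,
# standing in for whatever class owns the parse() method above
parser = PlaintextParser("corpus.txt")
for text in parser.parse():
    # one Text is yielded per block delimited by TEXT_SEPARATOR lines
    print(len(text.sentences))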
Example No. 2
    def parse(self):
        with open(self.filename, "r") as file_:
            text = Text()
            sentence = Sentence()
            sentence.tokens = []

            for line_unenc in file_:
                # end of a text reached: emit it and start a fresh text;
                # also reset the sentence buffer so no tokens leak into the next text
                if line_unenc.rstrip() == END_OF_TEXT_MARKER:
                    yield text
                    text = Text()
                    sentence = Sentence()
                    sentence.tokens = []
                    continue

                self._progress += 1

                # decode the line, dropping any bytes that are not valid UTF-8
                line = unicode(line_unenc, 'utf-8', errors='ignore')
                line = line.rstrip()

                # split the tab-separated line into word, POS tags and punctuation label
                line_parts = line.split('\t')
                word = self._get_word(line_parts)
                if word is None:
                    continue
                pos_tags = self._get_pos_tags(line_parts)
                punctuation = self._get_punctuation(line_parts)

                sentence.tokens.extend(self._create_tokens(word, pos_tags, punctuation))

                # a PERIOD label closes the current sentence
                if punctuation == 'PERIOD':
                    # fall back to the pipeline's POS tagger if the file provided no tags
                    if self.POS_TAGGING and not pos_tags:
                        self.nlp_pipeline.pos_tag(sentence.tokens)
                    text.add_sentence(sentence)
                    sentence = Sentence()
                    sentence.tokens = []

        # yield whatever remains, e.g. when the file has
        # no trailing end-of-text marker
        if len(text.sentences) > 0:
            yield text
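
The second variant reads a pre-tokenized file: each line carries tab-separated fields from which _get_word, _get_pos_tags and _get_punctuation pull the word, its POS tags and a punctuation label, a 'PERIOD' label closes a sentence, and END_OF_TEXT_MARKER lines delimit texts. A hedged sketch of consuming it and inspecting the resulting structure (the class name and constructor are again assumed; only parse(), Text.sentences and Sentence.tokens come from the code above):

# hypothetical driver: TokenizedCorpusParser is an assumed name for the class
# that owns the second parse() variant above
parser = TokenizedCorpusParser("corpus.tokenized")
for i, text in enumerate(parser.parse()):
    # count the tokens collected across all sentences of this text
    n_tokens = sum(len(s.tokens) for s in text.sentences)
    print("text %d: %d sentences, %d tokens" % (i, len(text.sentences), n_tokens))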