def parse(self):
    """Parse a plain-text corpus file, yielding one Text per section
    delimited by TEXT_SEPARATOR lines."""
    text = Text()
    with open(self.filename, "r") as file_:
        for line_unenc in file_:
            self._progress += 1
            line = unicode(line_unenc, 'utf-8')

            # a separator line marks the start of a new text
            if line.startswith(TEXT_SEPARATOR):
                if len(text.sentences) > 0:
                    yield text
                    text = Text()
                continue

            # segment the line into sentences and tokenize each of them
            sentences = self.nlp_pipeline.sentence_segmentation(line)
            for sentence in sentences:
                s = Sentence()
                s.set_sentence_text(sentence)
                s.set_tokens(self.nlp_pipeline.parse_text(sentence))
                text.add_sentence(s)

    # yield whatever remains after the last separator
    if len(text.sentences) > 0:
        yield text
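The generator above accumulates sentences until it hits a separator line, then yields the finished text and starts a new one. The following self-contained sketch shows the same accumulate-and-yield idiom in isolation; the TEXT_SEPARATOR value and the group_texts name are placeholders chosen for illustration, not part of the parser above.

# Minimal sketch of the grouping idiom: collect lines until a separator,
# then yield the collected block. Runs standalone.
TEXT_SEPARATOR = "<text>"  # assumed placeholder value

def group_texts(lines):
    block = []
    for line in lines:
        if line.startswith(TEXT_SEPARATOR):
            if block:
                yield block
                block = []
            continue
        block.append(line.rstrip())
    # yield whatever remains after the last separator
    if block:
        yield block

# Example: two texts separated by one separator line.
lines = ["First text, sentence one.", "<text>", "Second text."]
print(list(group_texts(lines)))
# [['First text, sentence one.'], ['Second text.']]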
def parse(self):
    """Parse a tab-separated corpus file (one token per line), yielding
    one Text per END_OF_TEXT_MARKER section."""
    with open(self.filename, "r") as file_:
        text = Text()
        sentence = Sentence()
        sentence.tokens = []
        for line_unenc in file_:
            # end of a text reached; skip empty texts caused by
            # consecutive markers
            if line_unenc.rstrip() == END_OF_TEXT_MARKER:
                if len(text.sentences) > 0:
                    yield text
                    text = Text()
                continue

            self._progress += 1

            # decode and strip the line
            line = unicode(line_unenc, 'utf-8', errors='ignore')
            line = line.rstrip()

            # split line into word, POS tags and punctuation label
            line_parts = line.split('\t')
            word = self._get_word(line_parts)
            if word is None:
                continue
            pos_tags = self._get_pos_tags(line_parts)
            punctuation = self._get_punctuation(line_parts)
            sentence.tokens.extend(
                self._create_tokens(word, pos_tags, punctuation))

            # a PERIOD label marks the end of the current sentence
            if punctuation == 'PERIOD':
                if self.POS_TAGGING and not pos_tags:
                    self.nlp_pipeline.pos_tag(sentence.tokens)
                text.add_sentence(sentence)
                sentence = Sentence()
                sentence.tokens = []

        # if the file does not end with an end-of-text marker,
        # yield the remaining sentences as one final text
        if len(text.sentences) > 0:
            yield text
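This second parser expects a token-per-line corpus where each line carries a word, optional POS tags, and a punctuation label, and a PERIOD label closes the current sentence. The sketch below illustrates that grouping idiom on a minimal, self-contained example; the exact three-column layout and the group_sentences helper are assumptions made for illustration, since the real column handling is done by the _get_word, _get_pos_tags and _get_punctuation helpers, which are not shown in this excerpt.

# Minimal sketch, assuming a word<TAB>pos<TAB>punctuation layout:
# tokens are collected until a PERIOD label ends the sentence.
def group_sentences(lines):
    sentence = []
    for line in lines:
        word, pos, punctuation = line.rstrip().split('\t')
        sentence.append((word, pos, punctuation))
        if punctuation == 'PERIOD':
            yield sentence
            sentence = []

sample = ["this\tDT\tO", "works\tVBZ\tPERIOD", "fine\tRB\tPERIOD"]
print(list(group_sentences(sample)))
# [[('this', 'DT', 'O'), ('works', 'VBZ', 'PERIOD')],
#  [('fine', 'RB', 'PERIOD')]]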