        # (fragment) Tail of a PosTag-parsing method whose start lies outside
        # this view: splits a comma-separated "PosTag.NAME" list and maps each
        # name to its PosTag enum member.
        else:
            pos_tag_str = line_parts[1].split(",")
            # "PosTag.NOUN" -> "NOUN" (drop the enum class prefix).
            pos_tag_types = map(lambda x: x.split(".")[1], pos_tag_str)
            # Look each name up in the PosTag enum; duplicates collapse in the set.
            return set(map(lambda x: PosTag[x], pos_tag_types))

    def progress(self):
        # Report parsing progress; delegated to the base-class line counter.
        return self._line_count_progress()

    def _create_tokens(self, word, pos_tags, punctuation):
        # Build the token list for one word: always a WordToken carrying
        # pos_tags, optionally followed by a PunctuationToken when the
        # punctuation string is 'PERIOD' or 'COMMA'.  Any other punctuation
        # value yields no punctuation token.
        word_token = WordToken(word)
        word_token.set_pos_tags(pos_tags)
        punctuation_token = None
        if punctuation == 'PERIOD':
            punctuation_token = PunctuationToken(punctuation, Punctuation.PERIOD)
        elif punctuation == 'COMMA':
            punctuation_token = PunctuationToken(punctuation, Punctuation.COMMA)
        if punctuation_token is not None:
            return [word_token, punctuation_token]
        return [word_token]


################
# Example call #
################
if __name__ == '__main__':
    parse_command_line_arguments(LineParser)
            # (fragment) Tail of a loop whose start lies outside this view:
            # remember where the previous token ended, then hand back the
            # accumulated audio object.
            last_end = token.begin + token.duration
            last_token = token
        return audio

    def _extract_talk_id(self, line):
        # Extract the numeric talk id that follows the literal "talkid"
        # in a header line.  The first two characters are stripped first
        # (presumably a comment/marker prefix -- TODO confirm file format).
        line = line[2:]
        line_parts = line.split("talkid")
        relevant = line_parts[1]
        # Seed with "0" so int() still succeeds when no digit follows
        # "talkid" (the leading zero does not change the numeric value).
        talkid = "0"
        for i in range(0, len(relevant)):
            if relevant[i].isdigit():
                talkid += relevant[i]
            else:
                # Stop at the first non-digit after the id.
                break
        return int(talkid)

    def progress(self):
        # Report parsing progress; delegated to the base-class line counter.
        return self._line_count_progress()


################
# Example call #
################
if __name__ == "__main__":
    parse_command_line_arguments(CtmParser)
        # (fragment) Body of a generator method whose `def` line lies outside
        # this view: reads self.filename line by line and yields one Text per
        # TEXT_SEPARATOR-delimited section.
        text = Text()
        with open(self.filename, "r") as file_:
            for line_unenc in file_:
                self._progress += 1
                # NOTE(review): encode-then-unicode looks backwards for
                # decoding py2 input (usually line_unenc.decode('utf8'));
                # confirm against the rest of the project.
                line = unicode(line_unenc.encode('utf8'))
                if line.startswith(TEXT_SEPARATOR):
                    # A separator closes the current text; emit it only if it
                    # actually collected sentences, then start a fresh one.
                    if (len(text.sentences) > 0):
                        yield text
                    text = Text()
                    continue
                sentences = self.nlp_pipeline.sentence_segmentation(line)
                for sentence in sentences:
                    s = Sentence()
                    s.set_sentence_text(sentence)
                    s.set_tokens(self.nlp_pipeline.parse_text(sentence))
                    text.add_sentence(s)
        # Flush the trailing text -- the file may not end with a separator.
        if (len(text.sentences) > 0):
            yield text

    def progress(self):
        # Report parsing progress; delegated to the base-class line counter.
        return self._line_count_progress()


################
# Example call #
################
if __name__ == '__main__':
    parse_command_line_arguments(PlaintextParser)
            # (fragment) Tail of a per-doc loop whose start lies outside this
            # view: collect every <seg> element of the doc as a Sentence on
            # `talk`, then yield the finished talk.
            for sentence in doc.findall("seg"):
                sentence_text = unicode(sentence.text)
                # Rebinds `sentence` from the XML element to the Sentence object.
                sentence = Sentence()
                sentence.set_sentence_text(sentence_text)
                sentence.set_tokens(
                    self.nlp_pipeline.parse_text(sentence_text))
                talk.add_sentence(sentence)
            yield talk

    def progress(self):
        # Report parsing progress; delegated to the base-class line counter.
        return self._line_count_progress()

    def _count_docs(self):
        # Count the <doc> elements under <srcset> in the mteval XML file
        # (used by the line/progress accounting).
        mteval = xml.etree.ElementTree.parse(self.filename).getroot()
        srcset = mteval.find("srcset")
        i = 0
        for doc in srcset.findall('doc'):
            i += 1
        return i


################
# Example call #
################
if __name__ == '__main__':
    parse_command_line_arguments(XMLParser)
            # (fragment) Tail of a loop whose start lies outside this view:
            # remember where the previous token ended, then hand back the
            # accumulated audio object.
            last_end = token.begin + token.duration
            last_token = token
        return audio

    def _extract_talk_id(self, line):
        # Extract the numeric talk id that follows the literal "talkid"
        # in a header line.  The first two characters are stripped first
        # (presumably a comment/marker prefix -- TODO confirm file format).
        line = line[2:]
        line_parts = line.split("talkid")
        relevant = line_parts[1]
        # Seed with "0" so int() still succeeds when no digit follows
        # "talkid" (the leading zero does not change the numeric value).
        talkid = "0"
        for i in range(0, len(relevant)):
            if relevant[i].isdigit():
                talkid += relevant[i]
            else:
                # Stop at the first non-digit after the id.
                break
        return int(talkid)

    def progress(self):
        # Report parsing progress; delegated to the base-class line counter.
        return self._line_count_progress()


################
# Example call #
################
if __name__ == '__main__':
    parse_command_line_arguments(CtmParser)
        # (fragment) Body of a generator method whose `def` line lies outside
        # this view: reads self.filename line by line and yields one Text per
        # TEXT_SEPARATOR-delimited section.
        text = Text()
        with open(self.filename, "r") as file_:
            for line_unenc in file_:
                self._progress += 1
                # NOTE(review): encode-then-unicode looks backwards for
                # decoding py2 input (usually line_unenc.decode('utf8'));
                # confirm against the rest of the project.
                line = unicode(line_unenc.encode('utf8'))
                if line.startswith(TEXT_SEPARATOR):
                    # A separator closes the current text; emit it only if it
                    # actually collected sentences, then start a fresh one.
                    if (len(text.sentences) > 0):
                        yield text
                    text = Text()
                    continue
                sentences = self.nlp_pipeline.sentence_segmentation(line)
                for sentence in sentences:
                    s = Sentence()
                    s.set_sentence_text(sentence)
                    s.set_tokens(self.nlp_pipeline.parse_text(sentence))
                    text.add_sentence(s)
        # Flush the trailing text -- the file may not end with a separator.
        if (len(text.sentences) > 0):
            yield text

    def progress(self):
        # Report parsing progress; delegated to the base-class line counter.
        return self._line_count_progress()


################
# Example call #
################
if __name__ == '__main__':
    parse_command_line_arguments(PlaintextParser)