def __init__(self, filename):
    """Initialize the XML parser for *filename*.

    Delegates to the base parser, then bails out immediately for files
    this parser does not want. Otherwise sets up the NLP pipeline and
    the document-count / progress bookkeeping used while parsing.
    """
    super(XMLParser, self).__init__(filename)
    if not self.wants_this_file():
        return
    self._progress = 0
    # Total number of documents drives progress reporting.
    self._linenumber = self._count_docs()
    self.nlp_pipeline = NlpPipeline()
def _initialize_with_tokens(self, tokens):
    """Populate ``self.tokens`` from raw token strings.

    Each raw token is wrapped in a ``WordToken``; when the
    ``pos_tagging`` feature flag is enabled, the tokens are POS-tagged
    in place before being stored.
    """
    # Wrap each raw token in a WordToken.
    word_tokens = [WordToken(token) for token in tokens]
    # POS-tag in place if the feature is enabled.
    if sbd.config.getboolean('features', 'pos_tagging'):
        nlp_pipeline = NlpPipeline()
        # BUG FIX: the original passed the undefined name `wordTokens`,
        # raising NameError whenever pos_tagging was enabled.
        nlp_pipeline.pos_tag(word_tokens)
    self.tokens = word_tokens
def __init__(self, filename):
    """Initialize the line-based parser for *filename*.

    Returns early when the base check rejects the file; otherwise
    prepares progress tracking, caches the POS-tagging feature flag,
    and builds the NLP pipeline.
    """
    super(LineParser, self).__init__(filename)
    if not self.wants_this_file():
        return
    self._init_line_count_progress()
    # NOTE(review): a former check that rejected the 'use_question_mark'
    # feature for this parser is disabled; question-mark support status
    # should be confirmed before re-enabling anything here.
    self.POS_TAGGING = sbd.config.getboolean('features', 'pos_tagging')
    self.nlp_pipeline = NlpPipeline()
def _initialize_with_talks(self, talks):
    """Populate ``self.tokens`` from the sentences of every talk.

    Punctuation tokens are dropped; the remaining words are wrapped in
    ``WordToken`` objects, optionally POS-tagged per sentence (feature
    flag), lower-cased, and concatenated in document order.
    """
    pipeline = NlpPipeline()
    collected = []
    for talk in talks:
        for sentence in talk.sentences:
            # Keep only non-punctuation tokens, wrapped as WordTokens.
            current = [
                WordToken(tok.word)
                for tok in sentence.tokens
                if not tok.is_punctuation()
            ]
            # Tag sentence-by-sentence when the feature flag is on
            # (flag is re-read per sentence, matching prior behavior).
            if sbd.config.getboolean('features', 'pos_tagging'):
                pipeline.pos_tag(current)
            # Lower-case each surviving token's word after tagging.
            for wt in current:
                wt.word = wt.word.lower()
            collected.extend(current)
    self.tokens = collected
def _initialize_with_text(self, text):
    """Tokenize *text* with a fresh NLP pipeline and store the result."""
    self.tokens = NlpPipeline().parse_text(text)
def __init__(self, text):
    """Keep the raw *text* and derive its gold tokens via the pipeline."""
    self.text = text
    self.nlp_pipeline = NlpPipeline()
    # Gold tokens are produced by parsing the stored text once.
    self.gold_tokens = self.nlp_pipeline.parse_text(text)
def __init__(self, filename):
    """Initialize the plain-text parser for *filename*.

    Skips all setup when the base check rejects the file; otherwise
    prepares progress tracking and the NLP pipeline.
    """
    super(PlaintextParser, self).__init__(filename)
    if self.wants_this_file():
        self._init_line_count_progress()
        self.nlp_pipeline = NlpPipeline()