示例#1
0
 def __init__(self, filename):
     """Set up the XML parser for ``filename``.

     The base-class initializer always runs. The remaining attributes
     (``nlp_pipeline``, ``_linenumber``, ``_progress``) are only set
     when ``wants_this_file()`` returns a truthy value.
     """
     super(XMLParser, self).__init__(filename)
     if self.wants_this_file():
         self.nlp_pipeline = NlpPipeline()
         # _linenumber holds whatever _count_docs() reports for this file.
         self._linenumber = self._count_docs()
         self._progress = 0
    def _initialize_with_tokens(self, tokens):
        """Build ``self.tokens`` from an iterable of raw tokens.

        Every element of ``tokens`` is wrapped in a ``WordToken``. When the
        ``pos_tagging`` option of the ``features`` config section is enabled,
        the wrapped tokens are handed to ``NlpPipeline.pos_tag`` before they
        are stored.

        Args:
            tokens: iterable of values accepted by the ``WordToken``
                constructor.
        """
        # convert tokens to WordTokens
        word_tokens = [WordToken(token) for token in tokens]

        # do pos_tagging if needed
        if sbd.config.getboolean('features', 'pos_tagging'):
            nlp_pipeline = NlpPipeline()
            # Tag the list built above. This call previously referenced the
            # undefined name ``wordTokens`` and raised NameError.
            # The return value is ignored, so pos_tag presumably annotates
            # the tokens in place -- TODO confirm against NlpPipeline.
            nlp_pipeline.pos_tag(word_tokens)

        self.tokens = word_tokens
    def __init__(self, filename):
        """Set up the line parser for ``filename``.

        The base-class initializer always runs. Line-count progress
        tracking, the ``POS_TAGGING`` flag and the NLP pipeline are only
        set up when ``wants_this_file()`` returns a truthy value.
        """
        super(LineParser, self).__init__(filename)
        if self.wants_this_file():
            self._init_line_count_progress()
            # NOTE: a check that raised ValueError("Question marks not
            # supported by LineParser") when the 'use_question_mark'
            # feature was enabled used to sit here; it is disabled.

            pos_tagging_enabled = sbd.config.getboolean('features', 'pos_tagging')
            self.POS_TAGGING = pos_tagging_enabled
            self.nlp_pipeline = NlpPipeline()
    def _initialize_with_talks(self, talks):
        """Build ``self.tokens`` from the sentences of the given talks.

        For every sentence of every talk, each token whose
        ``is_punctuation()`` is false is wrapped as ``WordToken(token.word)``.
        When the ``pos_tagging`` option of the ``features`` config section is
        enabled, the sentence's tokens are passed to ``NlpPipeline.pos_tag``.
        Afterwards every token's ``word`` is lower-cased and the sentence's
        tokens are appended to one flat list stored in ``self.tokens``.

        Args:
            talks: iterable of objects exposing ``sentences``; each sentence
                exposes ``tokens`` whose items provide ``word`` and
                ``is_punctuation()``.
        """
        nlp_pipeline = NlpPipeline()
        # The option is loop-invariant, so read it once rather than once per
        # sentence.
        pos_tagging = sbd.config.getboolean('features', 'pos_tagging')
        word_tokens = []

        for talk in talks:
            for sentence in talk.sentences:
                # get all word tokens (punctuation is dropped)
                sentence_tokens = [
                    WordToken(token.word)
                    for token in sentence.tokens
                    if not token.is_punctuation()
                ]
                # do pos_tagging if needed on sentence level
                if pos_tagging:
                    nlp_pipeline.pos_tag(sentence_tokens)
                # Lower-casing happens after tagging, so pos_tag receives the
                # words with their original casing.
                for word_token in sentence_tokens:
                    word_token.word = word_token.word.lower()
                word_tokens.extend(sentence_tokens)

        self.tokens = word_tokens
 def _initialize_with_text(self, text):
     """Store in ``self.tokens`` the result of parsing ``text``.

     A fresh ``NlpPipeline`` is created for the call and its
     ``parse_text`` result is assigned unchanged.
     """
     self.tokens = NlpPipeline().parse_text(text)
示例#6
0
    def __init__(self, text):
        """Keep ``text`` and the tokens an ``NlpPipeline`` parses from it.

        Sets ``self.text``, ``self.nlp_pipeline`` and ``self.gold_tokens``
        (the ``parse_text`` result for ``text``).
        """
        self.text = text

        pipeline = NlpPipeline()
        self.nlp_pipeline = pipeline
        self.gold_tokens = pipeline.parse_text(text)
示例#7
0
 def __init__(self, filename):
     """Set up the plaintext parser for ``filename``.

     The base-class initializer always runs. Line-count progress
     tracking and the NLP pipeline are only set up when
     ``wants_this_file()`` returns a truthy value.
     """
     super(PlaintextParser, self).__init__(filename)
     wanted = self.wants_this_file()
     if wanted:
         self._init_line_count_progress()
         self.nlp_pipeline = NlpPipeline()