Example #1
0
class PlaintextParser(AbstractParser):
    def __init__(self, filename):
        super(PlaintextParser, self).__init__(filename)
        if not self.wants_this_file():
            return
        self._init_line_count_progress()
        self.nlp_pipeline = NlpPipeline()

    def _wanted_file_endings(self):
        return (".txt", )

    def parse(self):
        text = Text()

        with open(self.filename, "r") as file_:
            for line_unenc in file_:
                self._progress += 1
                # decode the raw byte line (encoding it again would
                # implicitly ASCII-decode first and break on non-ASCII)
                line = unicode(line_unenc, 'utf-8')
                if line.startswith(TEXT_SEPARATOR):
                    # a separator line is never parsed as content
                    if len(text.sentences) > 0:
                        yield text
                        text = Text()
                    continue
                sentences = self.nlp_pipeline.sentence_segmentation(line)
                for sentence in sentences:
                    s = Sentence()
                    s.set_sentence_text(sentence)
                    s.set_tokens(self.nlp_pipeline.parse_text(sentence))
                    text.add_sentence(s)
        if len(text.sentences) > 0:
            yield text

    def progress(self):
        return self._line_count_progress()
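
A minimal sketch of driving this parser, assuming PlaintextParser and TEXT_SEPARATOR are importable as used above; the filename is hypothetical:

    parser = PlaintextParser("corpus.txt")
    if parser.wants_this_file():
        # parse() is a generator: each yielded Text collects the
        # sentences found between two TEXT_SEPARATOR lines
        for text in parser.parse():
            print(len(text.sentences))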
Example #2
0
 def __init__(self, filename):
     super(XMLParser, self).__init__(filename)
     if not self.wants_this_file():
         return
     self.nlp_pipeline = NlpPipeline()
     self._linenumber = self._count_docs()
     self._progress = 0
Example #3
0
class PlaintextParser(AbstractParser):
    def __init__(self, filename):
        super(PlaintextParser, self).__init__(filename)
        if not self.wants_this_file():
            return
        self._init_line_count_progress()
        self.nlp_pipeline = NlpPipeline()

    def _wanted_file_endings(self):
        return (".txt",)

    def parse(self):
        text = Text()

        with open(self.filename, "r") as file_:
            for line_unenc in file_:
                self._progress += 1
                # decode the raw byte line (encoding it again would
                # implicitly ASCII-decode first and break on non-ASCII)
                line = unicode(line_unenc, 'utf-8')
                if line.startswith(TEXT_SEPARATOR):
                    # a separator line is never parsed as content
                    if len(text.sentences) > 0:
                        yield text
                        text = Text()
                    continue
                sentences = self.nlp_pipeline.sentence_segmentation(line)
                for sentence in sentences:
                    s = Sentence()
                    s.set_sentence_text(sentence)
                    s.set_tokens(self.nlp_pipeline.parse_text(sentence))
                    text.add_sentence(s)
        if len(text.sentences) > 0:
            yield text

    def progress(self):
        return self._line_count_progress()
Example #4
0
    def _initialize_with_tokens(self, tokens):
        # convert tokens to WordTokens
        word_tokens = [WordToken(token) for token in tokens]

        # do pos_tagging if needed
        if sbd.config.getboolean('features', 'pos_tagging'):
            nlp_pipeline = NlpPipeline()
            nlp_pipeline.pos_tag(word_tokens)

        self.tokens = word_tokens
Example #5
0
    def _initialize_with_tokens(self, tokens):
        # convert tokens to WordTokens
        word_tokens = [WordToken(token) for token in tokens]

        # do pos_tagging if needed
        if sbd.config.getboolean('features', 'pos_tagging'):
            nlp_pipeline = NlpPipeline()
            nlp_pipeline.pos_tag(word_tokens)

        self.tokens = word_tokens
Example #6
0
    def __init__(self, filename):
        super(LineParser, self).__init__(filename)
        if not self.wants_this_file():
            return

        self._init_line_count_progress()
        # if sbd.config.getboolean('features', 'use_question_mark'):
        #     raise ValueError("Question marks not supported by LineParser")

        self.POS_TAGGING = sbd.config.getboolean('features', 'pos_tagging')
        self.nlp_pipeline = NlpPipeline()
Example #7
0
    def _initialize_with_talks(self, talks):
        nlp_pipeline = NlpPipeline()
        word_tokens = []

        for talk in talks:
            for sentence in talk.sentences:
                sentence_tokens = []
                # get all word tokens
                for token in sentence.tokens:
                    if not token.is_punctuation():
                        sentence_tokens.append(WordToken(token.word))
                # do pos_tagging if needed on sentence level
                if sbd.config.getboolean('features', 'pos_tagging'):
                    nlp_pipeline.pos_tag(sentence_tokens)
                for t in sentence_tokens:
                    t.word = t.word.lower()
                word_tokens += sentence_tokens

        self.tokens = word_tokens
Example #8
0
    def _initialize_with_talks(self, talks):
        nlp_pipeline = NlpPipeline()
        word_tokens = []

        for talk in talks:
            for sentence in talk.sentences:
                sentence_tokens = []
                # get all word tokens
                for token in sentence.tokens:
                    if not token.is_punctuation():
                        sentence_tokens.append(WordToken(token.word))
                # do pos_tagging if needed on sentence level
                if sbd.config.getboolean('features', 'pos_tagging'):
                    nlp_pipeline.pos_tag(sentence_tokens)
                for t in sentence_tokens:
                    t.word = t.word.lower()
                word_tokens += sentence_tokens

        self.tokens = word_tokens
Example #9
0
class InputText(object):

    def __init__(self, text):
        self.text = text

        self.nlp_pipeline = NlpPipeline()
        self.gold_tokens = self.nlp_pipeline.parse_text(self.text)

    def get_gold_tokens(self):
        return self.gold_tokens
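
A short usage sketch, assuming NlpPipeline.parse_text tokenizes raw text as in the constructor above; the sample sentence is illustrative:

    input_text = InputText(u"This is a test. It has two sentences.")
    # the tokens were computed once in __init__ and are only read here
    gold = input_text.get_gold_tokens()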
Example #10
0
class XMLParser(AbstractParser):
    def __init__(self, filename):
        super(XMLParser, self).__init__(filename)
        if not self.wants_this_file():
            return
        self.nlp_pipeline = NlpPipeline()
        self._linenumber = self._count_docs()
        self._progress = 0

    def _wanted_file_endings(self):
        return (".xml", )

    def parse(self):
        mteval = xml.etree.ElementTree.parse(self.filename).getroot()
        srcset = mteval.find("srcset")
        for doc in srcset.findall('doc'):
            self._progress += 1
            talk = Text()

            # "seg" keeps the XML element distinct from the Sentence
            # object it is converted into below
            for seg in doc.findall("seg"):
                sentence_text = unicode(seg.text)

                sentence = Sentence()
                sentence.set_sentence_text(sentence_text)
                sentence.set_tokens(
                    self.nlp_pipeline.parse_text(sentence_text))
                talk.add_sentence(sentence)

            yield talk

    def progress(self):
        return self._line_count_progress()

    def _count_docs(self):
        mteval = xml.etree.ElementTree.parse(self.filename).getroot()
        srcset = mteval.find("srcset")
        return len(srcset.findall('doc'))
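
parse() above walks an mteval-style tree: the root holds a <srcset>, each <doc> becomes one Text, and each <seg> becomes one Sentence. A minimal driving sketch; the filename is hypothetical:

    parser = XMLParser("dev2010.xml")
    if parser.wants_this_file():
        for talk in parser.parse():
            # one Text per <doc>, one Sentence per <seg>
            print(len(talk.sentences))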
Example #11
0
 def _initialize_with_text(self, text):
     nlp_pipeline = NlpPipeline()
     self.tokens = nlp_pipeline.parse_text(text)
Example #12
0
 def _initialize_with_text(self, text):
     nlp_pipeline = NlpPipeline()
     self.tokens = nlp_pipeline.parse_text(text)
Example #13
0
class LineParser(AbstractParser):

    def __init__(self, filename):
        super(LineParser, self).__init__(filename)
        if not self.wants_this_file():
            return

        self._init_line_count_progress()
        # if sbd.config.getboolean('features', 'use_question_mark'):
        #     raise ValueError("Question marks not supported by LineParser")

        self.POS_TAGGING = sbd.config.getboolean('features', 'pos_tagging')
        self.nlp_pipeline = NlpPipeline()

    def _wanted_file_endings(self):
        return (".line", )

    def parse(self):
        with open(self.filename, "r") as file_:
            text = Text()
            sentence = Sentence()
            sentence.tokens = []

            for line_unenc in file_:
                # end of a text reached
                if line_unenc.rstrip() == END_OF_TEXT_MARKER:
                    yield text
                    text = Text()
                    continue

                self._progress += 1

                # parse line
                # decode as UTF-8, ignoring undecodable bytes
                line = unicode(line_unenc, 'utf-8', errors='ignore')
                line = line.rstrip()

                # split line into word, pos_tags and type
                line_parts = line.split('\t')
                word = self._get_word(line_parts)
                if word is None:
                    continue
                pos_tags = self._get_pos_tags(line_parts)
                punctuation = self._get_punctuation(line_parts)

                sentence.tokens.extend(
                    self._create_tokens(word, pos_tags, punctuation))

                # we are at the end of a sentence
                if punctuation == 'PERIOD':
                    if self.POS_TAGGING and not pos_tags:
                        self.nlp_pipeline.pos_tag(sentence.tokens)
                    text.add_sentence(sentence)
                    sentence = Sentence()
                    sentence.tokens = []

        # if we do not have any end-of-text-marker
        # return everything as one text
        if len(text.sentences) > 0:
            yield text

    def _get_word(self, line_parts):
        word = unicode(line_parts[0])
        word = self.nlp_pipeline.process_word(word)
        # check if needed
        # if "?" in word and len(word) > 0:
        #     word = word.replace("?", "")
        return word

    def _get_punctuation(self, line_parts):
        if len(line_parts) == 2:
            return unicode(line_parts[1])
        else:
            return unicode(line_parts[2])

    def _get_pos_tags(self, line_parts):
        if len(line_parts) == 2:
            return set()
        else:
            pos_tag_str = line_parts[1].split(",")
            pos_tag_types = map(lambda x: x.split(".")[1], pos_tag_str)
            return set(map(lambda x: PosTag[x], pos_tag_types))

    def progress(self):
        return self._line_count_progress()

    def _create_tokens(self, word, pos_tags, punctuation):
        word_token = WordToken(word)
        word_token.set_pos_tags(pos_tags)

        punctuation_token = None
        if punctuation == 'PERIOD':
            punctuation_token = PunctuationToken(punctuation, Punctuation.PERIOD)
        elif punctuation == 'COMMA':
            punctuation_token = PunctuationToken(punctuation, Punctuation.COMMA)

        if punctuation_token is not None:
            return [word_token, punctuation_token]
        return [word_token]
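
Taken together, _get_word, _get_pos_tags and _get_punctuation above imply a tab-separated .line format: either word<TAB>punctuation or word<TAB>pos_tags<TAB>punctuation, where pos_tags is a comma-separated list of dotted entries (only the part after the "." is looked up in PosTag) and any punctuation value other than PERIOD or COMMA produces no punctuation token. A hypothetical fragment, with tag names and the "O" no-punctuation marker assumed for illustration:

    sample_lines = (
        u"hello\tO\n"                  # word + punctuation only
        u"world\tPERIOD\n"             # PERIOD closes the sentence
        u"a\tPosTag.DT\tO\n"           # word + pos_tags + punctuation
        u"test\tPosTag.NN\tPERIOD\n"
    )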
Example #14
0
    def __init__(self, text):
        self.text = text

        self.nlp_pipeline = NlpPipeline()
        self.gold_tokens = self.nlp_pipeline.parse_text(self.text)
Example #15
0
 def __init__(self, filename):
     super(PlaintextParser, self).__init__(filename)
     if not self.wants_this_file():
         return
     self._init_line_count_progress()
     self.nlp_pipeline = NlpPipeline()
Example #16
0
 def __init__(self, filename):
     super(PlaintextParser, self).__init__(filename)
     if not self.wants_this_file():
         return
     self._init_line_count_progress()
     self.nlp_pipeline = NlpPipeline()