예제 #1
0
class CorpusReader(BaseReader):
    """Reader that turns an analysed corpus text into a ``Document``.

    Every non-empty line of the input is handed to a ``SentenceReader``
    built around the supplied token reader and is treated as one sentence.
    """

    def __init__(self, token_reader: BaseReader, linesep: str=os.linesep):
        """Keep the token reader and build the sentence parser around it.

        Args:
            token_reader: Reader passed to ``SentenceReader`` for the tokens.
            linesep: Line separator forwarded to ``BaseReader``; defaults to
                ``os.linesep``.
        """
        super().__init__(linesep=linesep)
        self.token_reader = token_reader
        self.sentence_parser = SentenceReader(token_reader)

    def read(self, text: str):
        """Parse ``text`` and return it as a single-paragraph ``Document``.

        Args:
            text: Corpus content whose lines are separated by ``self.linesep``.

        Returns:
            A ``Document`` holding one ``Paragraph`` with the parsed sentences.
        """
        # NOTE: the whole analysed corpus is parsed eagerly, in one pass.
        parsed = [
            self.sentence_parser.read(row)
            for row in text.split(self.linesep)
            if row  # empty lines carry no sentence
        ]
        paragraph = Paragraph(parsed)
        document = Document()
        document.append(paragraph)
        return document
예제 #2
0
class HunPosCorpusReader(BaseReader):
    """Corpus reader with tab-separated tagged tokens and ISO-8859-2 encoding.

    Sentences in the input are separated by a doubled line separator.
    """

    # The same kind of reader as CorpusReader; only the encoding and the
    # separator differ. It would be worth refactoring both into a single,
    # parameterizable CorpusReader.
    def __init__(self):
        """Set up the tab-separated token reader and the sentence parser."""
        super().__init__(encoding="ISO-8859-2")
        tagged_reader = TaggedTokenReader("\t")
        self.word_parser = tagged_reader
        self.sentence_parser = SentenceReader(tagged_reader, self.linesep)

    def read(self, text: str):
        """Parse ``text`` and return it as a single-paragraph ``Document``.

        Args:
            text: Corpus content in which sentences are separated by two
                consecutive ``self.linesep`` sequences.

        Returns:
            A ``Document`` holding one ``Paragraph`` with the parsed sentences.
        """
        sentence_break = self.linesep + self.linesep
        parsed = [
            self.sentence_parser.read(chunk)
            for chunk in text.split(sentence_break)
            if len(chunk) > 1  # chunks of at most one character are skipped
        ]
        paragraph = Paragraph(parsed)
        document = Document()
        document.append(paragraph)
        return document
예제 #3
0
 def __init__(self):
     """Configure ISO-8859-2 reading with tab-separated tagged tokens.

     The sentence parser is built from the token reader and the line
     separator available on the instance after the base initialiser ran.
     """
     super().__init__(encoding="ISO-8859-2")
     tagged_reader = TaggedTokenReader("\t")
     self.word_parser = tagged_reader
     self.sentence_parser = SentenceReader(tagged_reader, self.linesep)
예제 #4
0
 def __init__(self, token_reader: BaseReader, linesep: str=os.linesep):
     """Keep the token reader and build the sentence parser around it.

     Args:
         token_reader: Reader passed to ``SentenceReader`` for the tokens.
         linesep: Line separator forwarded to the base initialiser;
             defaults to ``os.linesep``.
     """
     super().__init__(linesep=linesep)
     self.token_reader = token_reader
     self.sentence_parser = SentenceReader(token_reader)