Example #1
0
    def compareOnFile(self, fileName, encoding, resAccumulator):
        # Regression check: run the pure-Python port and the external mecab
        # binary over every line of fileName and report every difference
        # through resAccumulator instead of stopping at the first failure.
        helper = Helper()
        writer = Writer()
        runner = MecabOutputGetter()
        lineNum = 1
        for line in self.readFile(fileName, encoding, resAccumulator):
                text = line.strip()
                #if isPy2():
                #    text = text_type(text)
                if encoding == 'utf-8':
                    # Normalize characters that trip up the encoding
                    # (see Helper.fixEncodingError).
                    text = helper.fixEncodingError(text)
                nodes = self.viterbi.getBestPath(text)

                pyResult = writer.getMecabOutput(self.viterbi.getTokenizer(), nodes)
                try:
                    #runner = MecabOutputGetter()
                    mecabResult = runner.run(text)
                except IOError as e:
                    # mecab subprocess failure: log it and keep going.
                    resAccumulator.print(text_type(e))
                    continue
                try:

                    self.assertEqual(len(mecabResult), len(pyResult),
                        text + '\npyPort:\n' + helper.outputNodes(pyResult) +
                         '\nmecab:\n' + helper.outputNodes(mecabResult))
                    for i in range(len(mecabResult)):
                        self.assertEqual(mecabResult[i], pyResult[i], "at line " + str(lineNum) + ": '" + line + "'")
                except AssertionError as e:
                    # Record the mismatch but continue with the next line.
                    resAccumulator.print(text_type(e))
                lineNum += 1
                if lineNum % 500 == 0:
                    # Periodic progress indicator for long input files.
                    resAccumulator.print(text_type(lineNum) + ' lines have been processed')
        resAccumulator.print(text_type(lineNum) + ' lines have been processed')
Example #2
0
 def fixEncodingError(self, text):
     """Replace characters that commonly break encoding with close
     visual equivalents, e.g.:
       -  -> ー
       ~  ->  ~
       〝〟 -> ""

     Returns the fixed text (strings are immutable; the input is not
     modified in place).
     """
     fromChars = '-~〝〟'
     toChars = 'ー~""'
     if isPy2():
         # Python 2: unicode has no maketrans/translate pair usable here,
         # so substitute via an ordinal->ordinal table, one char at a time.
         table = self.maketransU(fromChars, toChars)
         pos = 0
         while pos < len(text):
             ch = table.get(ord(text[pos]), None)
             if ch is not None:
                 text = text[:pos] + unichr(ch) + text[pos+1:]
             pos += 1
         return text
     else:
         # Python 3: single C-level pass over the string.
         table = text_type.maketrans(fromChars, toChars)
         text = text.translate(table)
     return text
     # NOTE: an unreachable block of euc-jp error-detection code used to
     # follow the return above (and referenced an undefined variable);
     # it has been removed.
Example #3
0
 def getAllReadingAndDefinition(self, word):
     """Return a list of (kana reading, definition text) tuples for every
     dictionary entry that exactly matches *word*."""
     assert(len(word))
     results = []
     matches = self.lookupDict.exactMatchSearch(bytearray(word, 'utf-8'))
     for handler, matchLength in matches:
         # The handler packs the entry count in the low byte and the
         # first entry-offset index in the remaining bits.
         entryCount = handler & 0xff
         firstIndex = handler >> 8
         for idx in range(entryCount):
             entryOffset = self.getEntryOffset(firstIndex + idx)
             rawEntry = self.getEntry(entryOffset)
             kanji, kana, text = rawEntry.split(b'\x01')
             # Decode all three fields (kanji is decoded for validation
             # even though only kana and text are returned).
             kanji = text_type(kanji, 'utf-8')
             kana = text_type(kana, 'utf-8')
             text = text_type(text, 'utf-8')
             results.append((kana, text))
     return results
Example #4
0
 def run(self, expr):
     """Send *expr* to the running mecab process and return its output
     split on the line delimiter (the trailing EOS element is dropped)."""
     self.ensureOpen()
     # mecab speaks euc-jp; characters it cannot encode are dropped.
     payload = (expr + '\n').encode("euc-jp", "ignore")
     pipe = self.mecab.stdin
     pipe.write(payload)
     pipe.flush()
     rawReply = self.mecab.stdout.readline()
     decoded = text_type(rawReply, "euc-jp").rstrip('\r\n')
     return decoded.split(self.lineDelimiter)[:-1]
Example #5
0
 def getFeature(self, featureId):
     """Gets the dictionary entry for the word.

     The feature blob stores NUL-terminated strings; *featureId* is the
     byte offset of the wanted entry.  Returns None when no terminator
     is found after that offset.
     """
     terminator = self.featureBlob.find(b'\x00', featureId)
     if terminator < 0:
         return None
     return text_type(self.featureBlob[featureId:terminator], self.getCharSet())
Example #6
0
def dryBurn():
    """Smoke test: list the importable modules, then run the text
    processor over a sample Japanese sentence and print each extracted
    word/reading/definition/sentence row.
    """
    from pkgutil import iter_modules
    # A plain for loop works on both Python 2 and 3; the previous manual
    # a.next() call was Python-2-only and the bare `except: break` that
    # wrapped it silently masked the resulting AttributeError on Python 3.
    for moduleInfo in iter_modules():
        print(moduleInfo[1], '<br>')

    setupLogger()
    contents = '船が検疫所に着いたのは'
    textProc = TextProcessor(getDataLoader())
    for word, reading, definition, sentence in textProc.do(contents, Settings.NoExcessiveReading(), True):
        line = text_type('{0:<10}  {1:<10}  {2:<10}  {3}\n').format(word, reading, definition,sentence)
        line = line.strip('\n')
        print(line.encode('utf-8'))
Example #7
0
 def readFile(self, fileName, encoding, resAccumulator):
     """Yield the lines of *fileName* decoded with *encoding*.

     The file is read as binary and split on CRLF.  Lines that fail to
     decode are reported through *resAccumulator* and skipped.

     Fixes over the previous version:
     - `line.strip()` used to be computed and then discarded (the decode
       was applied to the raw line); the stripped bytes are now decoded.
     - the line counter used to advance only for good lines, so the line
       numbers in error reports drifted; it now counts every line.
     """
     with io.open(fileName, 'rb') as inFile:
         contents = inFile.read()
     contents = contents.split(b'\r\n')
     lineNum = 1
     encodingError = 0
     resAccumulator.print(fileName)
     for line in contents:
         try:
             text = text_type(line.strip(), encoding)
             yield text
         except UnicodeDecodeError as u:
             encodingError += 1
             resAccumulator.print('line {0}, pos {1}: encoding error'.format(lineNum, u.start))
         lineNum += 1
Example #8
0
def getUniqueCSVList(textProc, contents, deckFileName, tag):
    """Print one semicolon-separated CSV row per unique word extracted
    from *contents*, skipping words with no definition and words already
    present in the deck (when *deckFileName* is given)."""
    deck = DeckWords(deckFileName) if deckFileName else None
    if tag is None:
        tag = ''
    seen = set()
    for word, startPos, reading, definition, sentence in textProc.do(contents, Settings.NoExcessiveReading(), True):
        # Short-circuit order preserved: the deck lookup only runs for
        # new words that have a definition.
        if word in seen or not definition or deck and deck.isInDeck(word):
            continue
        seen.add(word)
        line = text_type('"{0:}";"{1:}";"{2:}";"{3}";"{4}"').format(word, reading, definition,sentence, tag)
        if isPy2():
            print(line.encode('utf-8'))
        else:
            print(line)
Example #9
0
    def internalSearch(self, encodedText, functionToMatch):
        """Run *functionToMatch* over *encodedText* (bytes/bytearray in
        the dictionary's charset) and return the resulting Token list.
        """
        tokens = []
        tokenStartIds = functionToMatch(encodedText)
        for tokenHandler, tokenLength in tokenStartIds:
            # The handler packs the token count in the low byte and the
            # first token id in the remaining bits.
            tokenNum = tokenHandler & 0xff
            tokenStartId = tokenHandler >> 8
            # The surface string depends only on tokenLength, so decode it
            # once per match instead of once per token (was recomputed
            # identically inside the inner loop).
            tokenText = text_type(bytes(encodedText[:tokenLength]), self.getCharSet())
            for i in range(tokenNum):
                d = self.getToken(tokenStartId + i)
                t = Token(tokenText, d.leftAttribute,
                          d.rightAttribute, d.partOfSpeechId,
                          d.wordCost, d.featureId, d.compound)
                tokens.append(t)
        return tokens
Example #10
0
 def print(self, line):
     """Write *line* (coerced to text) to the backing file, newline-terminated."""
     out = self.file
     out.write(text_type(line))
     out.write('\n')