def compareOnFile(self, fileName, encoding, resAccumulator):
    """Compare this Python port's tokenizer output against real MeCab, line by line.

    Reads fileName via self.readFile, tokenizes each line with the ported
    Viterbi implementation, runs the same line through the external mecab
    binary, and reports mismatches to resAccumulator instead of failing fast.
    """
    helper = Helper()
    writer = Writer()
    runner = MecabOutputGetter()
    lineNum = 1
    for line in self.readFile(fileName, encoding, resAccumulator):
        text = line.strip()
        #if isPy2():
        #    text = text_type(text)
        # utf-8 sources may contain characters that have no euc-jp mapping;
        # normalize them before tokenizing (see Helper.fixEncodingError).
        if encoding == 'utf-8':
            text = helper.fixEncodingError(text)
        nodes = self.viterbi.getBestPath(text)
        pyResult = writer.getMecabOutput(self.viterbi.getTokenizer(), nodes)
        try:
            #runner = MecabOutputGetter()
            mecabResult = runner.run(text)
        except IOError as e:
            # external mecab process failed for this line; log and move on
            resAccumulator.print(text_type(e))
            continue
        try:
            # First check token counts, then compare token-by-token.
            self.assertEqual(len(mecabResult), len(pyResult), text + '\npyPort:\n' + helper.outputNodes(pyResult) + '\nmecab:\n' + helper.outputNodes(mecabResult))
            for i in range(len(mecabResult)):
                self.assertEqual(mecabResult[i], pyResult[i], "at line " + str(lineNum) + ": '" + line + "'")
        except AssertionError as e:
            # Accumulate the mismatch report; do not abort the whole file.
            resAccumulator.print(text_type(e))
        lineNum += 1
        # periodic progress report
        if lineNum % 500 == 0:
            resAccumulator.print(text_type(lineNum) + ' lines have been processed')
    resAccumulator.print(text_type(lineNum) + ' lines have been processed')
def fixEncodingError(self, text):
    """Replace characters that cannot round-trip through euc-jp.

    Maps a few lookalike characters to their euc-jp-encodable equivalents:
    - -> ー
    ~ -> ~
    〝〟 -> ""

    :param text: unicode text to sanitize
    :return: text with the problem characters substituted

    Fix: removed the unreachable code that used to follow the if/else below
    (both branches return, so it could never run; it also referenced an
    undefined name ``e``).
    """
    fromChars = '-~〝〟'
    toChars = 'ー~""'
    if isPy2():
        # Python 2: no unicode maketrans/translate pair that maps
        # char -> char by ordinal, so substitute manually.
        table = self.maketransU(fromChars, toChars)
        pos = 0
        while pos < len(text):
            ch = table.get(ord(text[pos]), None)
            if ch is not None:
                text = text[:pos] + unichr(ch) + text[pos+1:]
            pos += 1
        return text
    else:
        # Python 3: str.translate does the substitution in one C-level pass.
        table = text_type.maketrans(fromChars, toChars)
        return text.translate(table)
def getAllReadingAndDefinition(self, word):
    """Look up word in the dictionary and return all (reading, definition) pairs.

    Performs an exact-match trie search on the utf-8 bytes of word; each hit
    packs an entry count in the low byte of the handler and the first entry
    offset index in the remaining bits.
    """
    assert(len(word))
    results = []
    for handler, matchedLen in self.lookupDict.exactMatchSearch(bytearray(word, 'utf-8')):
        count = handler & 0xff
        firstEntryPos = handler >> 8
        for idx in range(count):
            entryOffset = self.getEntryOffset(firstEntryPos + idx)
            rawEntry = self.getEntry(entryOffset)
            # entry layout: kanji \x01 kana \x01 definition
            kanjiBytes, kanaBytes, defBytes = rawEntry.split(b'\x01')
            kanji = text_type(kanjiBytes, 'utf-8')
            kana = text_type(kanaBytes, 'utf-8')
            definition = text_type(defBytes, 'utf-8')
            results.append((kana, definition))
    return results
def run(self, expr):
    """Send expr to the external mecab process and return its token lines.

    Writes one euc-jp-encoded line to mecab's stdin, reads one response line
    from its stdout, and splits it on self.lineDelimiter.  The trailing
    element after the final delimiter (mecab's EOS marker / empty tail) is
    dropped via the [:-1] slice.
    """
    self.ensureOpen()
    expr += '\n'
    # mecab speaks euc-jp; characters with no mapping are dropped ("ignore")
    self.mecab.stdin.write(expr.encode("euc-jp", "ignore"))
    self.mecab.stdin.flush()
    exprFromMecab = text_type(self.mecab.stdout.readline(), "euc-jp")
    exprFromMecab = exprFromMecab.rstrip('\r\n')
    return exprFromMecab.split(self.lineDelimiter)[:-1]
def getFeature(self, featureId):
    """ Gets the dictionary entry for the word """
    # Features are stored as NUL-terminated strings inside one blob;
    # featureId is the byte offset where this feature starts.
    terminator = self.featureBlob.find(b'\x00', featureId)
    if terminator < 0:
        # no terminator found -> invalid/absent feature
        return None
    return text_type(self.featureBlob[featureId:terminator], self.getCharSet())
def dryBurn():
    """Smoke-test entry point: list importable modules, then run the text
    processor over a fixed Japanese sample sentence and print the results.

    Fix: the old code called ``a.next()`` (Python-2-only iterator protocol)
    inside a bare ``except: break``, so on Python 3 the AttributeError was
    silently swallowed and no modules were ever printed.  Iterating the
    generator directly works on both Python versions.
    """
    from pkgutil import iter_modules
    for x in iter_modules():
        # x is a (module_finder, name, ispkg) tuple; x[1] is the module name
        print(x[1], '<br>')
    setupLogger()
    contents = '船が検疫所に着いたのは'
    textProc = TextProcessor(getDataLoader())
    for word, reading, definition, sentence in textProc.do(contents, Settings.NoExcessiveReading(), True):
        line = text_type('{0:<10} {1:<10} {2:<10} {3}\n').format(word, reading, definition,sentence)
        line = line.strip('\n')
        print(line.encode('utf-8'))
def readFile(self, fileName, encoding, resAccumulator):
    """Yield decoded lines from fileName, reporting decode errors.

    The file is read as raw bytes and split on CRLF; each line is decoded
    with the given encoding.  Undecodable lines are skipped and reported to
    resAccumulator with their 1-based line number and error position.

    Fixes: removed a dead ``text = line.strip()`` assignment that was
    immediately overwritten, and moved the line counter out of the try body
    so it also advances on decode errors (previously the reported line
    numbers drifted after the first bad line).
    """
    with io.open(fileName, 'rb') as inFile:
        contents = inFile.read()
        contents = contents.split(b'\r\n')
        lineNum = 1
        encodingError = 0
        resAccumulator.print(fileName)
        for line in contents:
            try:
                text = text_type(line, encoding)
                yield text
            except UnicodeDecodeError as u:
                encodingError += 1
                resAccumulator.print('line {0}, pos {1}: encoding error'.format(lineNum, u.start))
            # count every physical line, decodable or not
            lineNum += 1
def getUniqueCSVList(textProc, contents, deckFileName, tag):
    """Print one semicolon-separated CSV row per unique defined word found
    in contents, skipping words already present in the given Anki deck.
    """
    deck = DeckWords(deckFileName) if deckFileName else None
    if tag is None:
        tag = ''
    seenWords = set()
    for word, startPos, reading, definition, sentence in textProc.do(contents, Settings.NoExcessiveReading(), True):
        # skip duplicates, words without a definition, and deck members
        if word in seenWords or not definition or (deck and deck.isInDeck(word)):
            continue
        seenWords.add(word)
        line = text_type('"{0:}";"{1:}";"{2:}";"{3}";"{4}"').format(word, reading, definition,sentence, tag)
        if isPy2():
            print(line.encode('utf-8'))
        else:
            print(line)
def internalSearch(self, encodedText, functionToMatch):
    """Run a trie match over encodedText and expand every hit into Token objects.

    :param encodedText: bytearray in the dictionary's charset
    :param functionToMatch: matcher returning (tokenHandler, tokenLength)
        pairs; the handler packs the token count in its low byte and the
        first token id in the remaining bits
    :return: list of Token instances

    Fixes: hoisted the loop-invariant surface-text decode out of the inner
    loop (it was recomputed once per token instead of once per match) and
    removed a stale commented-out encoding block.
    """
    tokens = []
    charSet = self.getCharSet()
    tokenStartIds = functionToMatch(encodedText)
    for tokenHandler, tokenLength in tokenStartIds:
        tokenNum = tokenHandler & 0xff
        tokenStartId = tokenHandler >> 8
        # The surface text depends only on tokenLength, not on i: decode once.
        tokenText = text_type(bytes(encodedText[:tokenLength]), charSet)
        for i in range(tokenNum):
            d = self.getToken(tokenStartId + i)
            t = Token(tokenText, d.leftAttribute, d.rightAttribute, d.partOfSpeechId, d.wordCost, d.featureId, d.compound)
            tokens.append(t)
    return tokens
def print(self, line):
    """Write line to the accumulator's file, coerced to text, newline-terminated."""
    out = self.file
    out.write(text_type(line))
    out.write('\n')