def fixEncodingError(self, text):
    """Return ``text`` with a fixed set of problem characters substituted.

    Each character of ``fromChars`` is replaced by the character at the same
    index in ``toChars``; every other character is left untouched. The text
    is not validated against any target encoding and no exception is raised
    for characters outside the substitution table.

    :param text: text to clean up.
    :return: the text with the substitutions applied.
    """
    # try to fix
    # - -> ー
    # ~ -> ~
    fromChars = '-~〝〟'
    toChars = 'ー~""'

    if not isPy2():
        return text.translate(text_type.maketrans(fromChars, toChars))

    # Python 2 path: the table is looked up by code point and yields the
    # replacement code point, so substitute one character at a time.
    table = self.maketransU(fromChars, toChars)
    pos = 0
    while pos < len(text):
        replacement = table.get(ord(text[pos]))
        if replacement is not None:
            text = text[:pos] + unichr(replacement) + text[pos + 1:]
        pos += 1
    return text
def getPartOfSpeech():
    """Print the output of a ``MecabRunner`` for a fixed sample sentence.

    The runner is created with the format string ``'%m,%f[7]'``. Each result
    line is joined without a separator and printed. Under Python 2 the
    results are still iterated but nothing is printed.
    """
    sample = '雨が降っていたん'  # alternative sample input: '海泡石'
    for parts in MecabRunner('%m,%f[7]').run(sample):
        if isPy2():
            continue
        print(''.join(parts))
def dumpNodeInfo():
    """Print the output of a ``MecabOutputGetter`` for a fixed sample sentence.

    Each result line is joined with a single space and printed. Under
    Python 2 the results are still iterated but nothing is printed.
    """
    # Alternative sample input: 'すべてに滲《し》み込み'
    sample = '雨が降っていたん'
    for fields in MecabOutputGetter().run(sample):
        if isPy2():
            continue
        print(' '.join(fields))
def getReadingAndDefinition(self, word):
    """Look up the ``kana`` and ``entry`` columns stored for a kanji word.

    Queries the ``dict`` table for the first row whose ``kanji`` column
    equals ``word``. On Python 2 the word is encoded to UTF-8 before it is
    bound to the query.

    :param word: the kanji spelling to look up.
    :return: ``(kana, entry)`` of the first matching row, or
             ``(None, None)`` when no row matches.
    """
    cursor = self.__conn.cursor()
    key = word.encode('utf-8') if isPy2() else word
    cursor.execute("select kana, entry from dict where kanji=:what order by kanji", {"what": key})
    row = cursor.fetchone()
    if not row:
        return None, None
    return row[0], row[1]
def getUniqueCSVList(textProc, contents, deckFileName, tag):
    """Print one semicolon-separated, double-quoted row per new word.

    Iterates the tuples produced by ``textProc.do(contents, ...)`` and prints
    a row ``"word";"reading";"definition";"sentence";"tag"`` for every word
    that has a definition, has not been printed before, and is not reported
    by the deck (when a deck file is given) as already present.

    :param textProc: object whose ``do`` method yields
                     ``(word, startPos, reading, definition, sentence)``.
    :param contents: text to process.
    :param deckFileName: deck file used to filter known words; falsy to
                         disable the filter.
    :param tag: value for the last column; ``None`` is written as ''.

    NOTE(review): field values are inserted as-is, so a value containing
    a double quote would produce a malformed row.
    """
    deck = DeckWords(deckFileName) if deckFileName else None
    tag = '' if tag is None else tag
    rowTemplate = text_type('"{0:}";"{1:}";"{2:}";"{3}";"{4}"')

    seenWords = set()
    entries = textProc.do(contents, Settings.NoExcessiveReading(), True)
    for word, _startPos, reading, definition, sentence in entries:
        # Skip checks run in the same order as the original chained condition.
        if word in seenWords:
            continue
        if not definition:
            continue
        if deck and deck.isInDeck(word):
            continue
        seenWords.add(word)

        row = rowTemplate.format(word, reading, definition, sentence, tag)
        if isPy2():
            # Python 2 needs explicit bytes for non-ASCII output.
            print(row.encode('utf-8'))
        else:
            print(row)
def main():
    """Command-line entry point: print the unique word list of a text file.

    Arguments:
      inputfile  text file to process (read as UTF-8).
      -d         optional deck file; its words are left out of the list.
      -t         optional tag appended to every row.
      -o         optional output file; when given, standard output is
                 redirected to it for the duration of the run.
    """
    parser = argparse.ArgumentParser(description='Get the list word in the text.')
    parser.add_argument('inputfile', metavar='input file name', help='input file name')
    parser.add_argument('-d', metavar='deck file name', required=False, help='deck file name')
    parser.add_argument('-t', metavar='tag', required=False, help='optional tag appended to the list')
    parser.add_argument('-o', metavar='output file name', required=False, help='output file name')
    args = parser.parse_args()

    originalStdout = sys.stdout
    outputFile = None
    if args.o:
        # Go through the version-aware helper: the built-in open() does not
        # accept an ``encoding`` keyword on Python 2.
        outputFile = openOutputFile(args.o)
        sys.stdout = outputFile
    try:
        setupLogger()
        with openInputFile(args.inputfile) as inputFile:
            contents = inputFile.read()
        if isPy2():
            # On Python 2 the file yields bytes; decode them explicitly.
            contents = unicode(contents, 'utf-8')
        textProc = TextProcessor(getDataLoader())
        getUniqueCSVList(textProc, contents, args.d, args.t)
    finally:
        if outputFile is not None:
            # Restore stdout before closing so later prints do not hit a
            # closed file, and make sure buffered rows reach the disk.
            sys.stdout = originalStdout
            outputFile.close()
def openOutputFile(fileName):
    """Open ``fileName`` for writing in text mode.

    On Python 3 the file is opened with UTF-8 encoding; on Python 2 the
    built-in ``open`` is called without an encoding argument.

    :param fileName: path of the file to create or truncate.
    :return: the open file object.
    """
    openOptions = {} if isPy2() else {'encoding': 'utf-8'}
    return open(fileName, 'w', **openOptions)
def openInputFile(fileName):
    """Open ``fileName`` for reading in text mode.

    On Python 3 the file is opened with UTF-8 encoding; on Python 2 the
    built-in ``open`` is called without an encoding argument.

    :param fileName: path of the file to read.
    :return: the open file object.
    """
    openOptions = {} if isPy2() else {'encoding': 'utf-8'}
    return open(fileName, 'r', **openOptions)