def analyze(self, text):
    """
    Run `text` through the external analyzer process, and return a list of
    lists ("records"): one record per analyzed word, produced by splitting
    each output line on single spaces.

    Blank input (after `render_safe` and stripping) yields an empty list.
    The process is fed one line at a time; a bare newline from the process
    marks the end of the output for that line.

    On `ProcessError` the subprocess is restarted and the whole analysis is
    retried from scratch.
    """
    # Retry with a loop rather than by recursing into self.analyze():
    # a subprocess that fails persistently must not be able to exhaust
    # the Python stack. Each retry re-runs the full body, matching the
    # original recursive behavior.
    while True:
        try:
            normalized = render_safe(text).strip()
            if not normalized:
                return []
            results = []
            for chunk_text in normalized.split('\n'):
                # Skip lines that are empty or whitespace-only.
                if not chunk_text.strip():
                    continue
                self.send_input((chunk_text + '\n').encode('utf-8'))
                while True:
                    out_line = self.receive_output_line().decode('utf-8')
                    # A bare newline terminates this line's output.
                    if out_line == '\n':
                        break
                    results.append(out_line.strip('\n').split(' '))
            return results
        except ProcessError:
            self.restart_process()
def analyze(self, text):
    """
    Run a line of text through MeCab, and return the results as a list of
    `MeCabRecord` objects, one per analyzed word.

    The text is flattened to a single logical line (newlines become
    spaces), lowercased, and fed to MeCab in chunks of at most 1024
    characters. Each MeCab output line is a surface form, a tab, and a
    comma-separated feature list; `EOS` terminates a chunk's output.
    Records are padded with `None` out to 10 fields when MeCab emits
    fewer features.

    On `ProcessError` the subprocess is restarted and the whole analysis
    is retried from scratch.
    """
    # Retry with a loop rather than by recursing into self.analyze():
    # a subprocess that keeps dying must not be able to exhaust the
    # Python stack. Each retry re-runs the full body, matching the
    # original recursive behavior.
    while True:
        try:
            self.process  # make sure things are loaded
            plain_text = render_safe(text).replace('\n', ' ').lower()
            # NOTE(review): chunking by character count can split a word
            # across a chunk boundary — presumably acceptable here, but
            # worth confirming against callers.
            n_chunks = (len(plain_text) + 1024) // 1024
            results = []
            for chunk in range(n_chunks):
                chunk_text = plain_text[chunk * 1024:(chunk + 1) * 1024]
                self.send_input((chunk_text + '\n').encode('utf-8'))
                while True:
                    out_line = self.receive_output_line().decode('utf-8')
                    # MeCab signals end-of-sentence with an EOS line.
                    if out_line == 'EOS\n':
                        break
                    word, info = out_line.strip('\n').split('\t')
                    record_parts = [word] + info.split(',')
                    # Pad the record out to have 10 parts if it doesn't
                    record_parts += [None] * (10 - len(record_parts))
                    record = MeCabRecord(*record_parts)
                    # special case for detecting nai -> n
                    if (record.surface == 'ん' and
                            record.conjugation == '不変化型'):
                        # rebuild the record so that record.root is 'nai'
                        root_index = MeCabRecord._fields.index('root')
                        record_parts[root_index] = 'ない'
                        record = MeCabRecord(*record_parts)
                    results.append(record)
            return results
        except ProcessError:
            self.restart_process()