def make_train_text(model, use_wakatigaki):
    input_text = open(os.path.join(model.prepared_file_path, 'input.txt'), 'w')
    if use_wakatigaki:
        logger.info('Use wakatigaki option.')
        import MeCab
        m = MeCab.Tagger("-Owakati")
        for f in ds_utils.find_all_files(model.dataset.dataset_path):
            raw_text = open(f, 'r').read()
            encoding = nkf.guess(raw_text)
            # Skip files that nkf identifies as binary.
            if encoding == 'BINARY':
                continue
            text = raw_text.decode(encoding, 'ignore')
            text = text.replace('\r', '')
            encoded_text = text.encode('UTF-8')
            lines = encoded_text.splitlines()
            for line in lines:
                # Split each line into space-separated words (wakati-gaki).
                result = m.parse(line)
                if result is None:
                    continue
                input_text.write(result)
                input_text.flush()
    else:
        for f in ds_utils.find_all_files(model.dataset.dataset_path):
            temp_text = open(f, 'r').read()
            encoding = nkf.guess(temp_text)
            if encoding == 'BINARY':
                continue
            decoded_text = temp_text.decode(encoding, 'ignore')
            decoded_text = decoded_text.replace('\r', '')
            encoded_text = decoded_text.encode('UTF-8')
            input_text.write(encoded_text)
            input_text.flush()
    input_text.close()
    return os.path.join(model.prepared_file_path, 'input.txt')
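# A minimal sketch of the MeCab wakati tokenization that make_train_text() relies on,
# assuming the mecab-python binding is installed; the sample string is illustrative only.
import MeCab

tagger = MeCab.Tagger("-Owakati")
sample = "これはテストです"
# parse() returns the sentence as space-separated tokens with a trailing newline.
print(tagger.parse(sample))  # e.g. "これ は テスト です\n"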
def save_uploaded_file_to_category(self, uploaded_file, category):
    filename = uploaded_file.filename
    name, ext = os.path.splitext(filename)
    ext = ext.lower()
    if self.type == 'image':
        if ext not in ('.jpg', '.jpeg', '.png', '.gif'):
            raise ValueError('Invalid file type.')
    elif self.type == 'text':
        if ext not in ('.txt',):
            raise ValueError('Invalid file type.')
    new_filename = os.path.join(
        self.dataset_path, category,
        ds_util.get_timestamp() + '_' + secure_filename(filename))
    if self.type == 'image':
        uploaded_file.save(new_filename)
    elif self.type == 'text':
        text = uploaded_file.stream.read()
        # nkf.guess() reports binary data as 'BINARY'.
        if nkf.guess(text) == 'BINARY':
            raise ValueError('Invalid file type. File must be a text file.')
        f = open(new_filename, 'w')
        f.write(text)
        f.close()
    self.file_num += 1
    self.update_and_commit()
def get_text_sample(path, character_num=-1):
    raw_text = open(path).read()
    encoding = nkf.guess(raw_text)
    text = raw_text.decode(encoding)
    if character_num > -1:
        return text[0:character_num]
    return text
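# A minimal usage sketch for get_text_sample(); 'sample.txt' is a hypothetical path.
# Passing character_num limits the result to the first N decoded characters.
preview = get_text_sample('sample.txt', character_num=200)
full_text = get_text_sample('sample.txt')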
def decode(text, encoding=None, *args):
    # Re-guess the encoding when none is given or when it was reported as ISO-8859-1.
    if not encoding or encoding in ('ISO-8859-1', 'iso-8859-1'):
        encoding = nkf.guess(text)
    if encoding in ('BINARY', 'ISO-8859-1'):
        encoding = 'utf8'
    encoding = normalize_encoding(encoding)
    if encoding not in all_encodings:
        # Fall back to nkf itself: '-w' converts the input to UTF-8.
        return nkf.nkf('-w', text).decode('utf8')
    return text.decode(encoding, *args)
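# A minimal usage sketch for decode(), assuming it is imported from the module above
# (module name not shown here); the byte string is illustrative only.
sjis_bytes = '日本語'.encode('shift_jis')
print(decode(sjis_bytes))                # encoding guessed via nkf.guess()
print(decode(sjis_bytes, 'ISO-8859-1'))  # an ISO-8859-1 hint is treated as unreliable and re-guessed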
def getencoding(dat: bytes):
    # A NUL byte is a strong sign of binary data.
    if b"\0" in dat:
        return None
    enc = nkf.guess(dat).lower()
    if enc == "shift_jis":
        # cp932 is a superset of Shift_JIS, so it also covers Windows extensions.
        return "cp932"
    elif enc == "binary":
        return None
    else:
        return enc
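# A minimal usage sketch for getencoding(); the file path is hypothetical.
# Read the raw bytes, guess the codec, and decode only when one was detected.
with open("sample_sjis.txt", "rb") as fh:
    data = fh.read()
codec = getencoding(data)
text = data.decode(codec) if codec else None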
#!/usr/bin/env python3
# encoding: utf-8
# This is a Python 3 script, not Python 2.
import nkf  # NKF wrapper for Python 3. See: http://sourceforge.jp/projects/nkf/scm/git/nkf/tree/master
try:
    # BeautifulSoup, the well-known HTML parser for Python, is usable on Python 3
    # via bs4 (or the old BeautifulSoup package converted with 2to3).
    from bs4 import BeautifulSoup as BSoup
except ImportError:
    from BeautifulSoup import BeautifulSoup as BSoup
import urllib.request
from sys import argv

AURI = "http://osu.ppy.sh/pages/include/profile-general.php?u=1679287"
if len(argv) <= 1:
    URI = AURI
else:
    URI = argv[1]

bHtml = urllib.request.urlopen(URI).read()
charset = nkf.guess(bHtml)
sHtml = bHtml.decode(charset)
# bs4's prettify() already returns a str in Python 3, so no extra decode() is needed.
print(BSoup(sHtml).prettify())