示例#1
0
def _read_as_utf8(path):
    """Read *path* and return its content re-encoded as UTF-8 bytes.

    The encoding is guessed with ``nkf.guess``. Bytes that cannot be decoded
    with the guessed encoding are dropped (``'ignore'``) and carriage returns
    are removed before re-encoding.

    Returns ``None`` when the guess is ``'BINARY'`` so callers can skip the
    file.

    NOTE(review): relies on ``read()`` returning a byte string that has a
    ``decode`` method (Python 2 ``str``) -- confirm the target interpreter.
    """
    with open(path, 'r') as source:
        raw_text = source.read()
    encoding = nkf.guess(raw_text)
    if encoding == 'BINARY':
        return None
    text = raw_text.decode(encoding, 'ignore').replace('\r', '')
    return text.encode('UTF-8')


def make_train_text(model, use_wakatigaki):
    """Concatenate every text file of the model's dataset into ``input.txt``.

    All files reported by ``ds_utils.find_all_files`` for
    ``model.dataset.dataset_path`` are normalised to UTF-8 and appended to
    ``<model.prepared_file_path>/input.txt``. Files whose encoding is guessed
    as binary are skipped.

    Args:
        model: object exposing ``prepared_file_path`` and
            ``dataset.dataset_path``.
        use_wakatigaki: when truthy, each line is passed through
            ``MeCab.Tagger("-Owakati")`` and the tagger output is written
            instead of the raw text.

    Returns:
        The path of the written ``input.txt`` file.
    """
    output_path = os.path.join(model.prepared_file_path, 'input.txt')
    # The context manager guarantees the output file is closed even when
    # reading, decoding or tagging one of the inputs raises.
    with open(output_path, 'w') as input_text:
        tagger = None
        if use_wakatigaki:
            logger.info('Use wakatigaki option.')
            # Imported lazily so MeCab is only required when it is used.
            import MeCab
            tagger = MeCab.Tagger("-Owakati")
        for f in ds_utils.find_all_files(model.dataset.dataset_path):
            encoded_text = _read_as_utf8(f)
            if encoded_text is None:
                # Guessed as binary: nothing sensible to train on.
                continue
            if tagger is None:
                input_text.write(encoded_text)
                input_text.flush()
                continue
            for line in encoded_text.splitlines():
                result = tagger.parse(line)
                if result is None:
                    # The tagger produced no output for this line.
                    continue
                input_text.write(result)
                input_text.flush()
    return output_path
示例#2
0
 def save_uploaded_file_to_category(self, uploaded_file, category):
     """Validate an uploaded file and store it under *category*.

     The file extension is checked against the allowed set for
     ``self.type`` (``'image'`` or ``'text'``). The file is then written to
     ``self.dataset_path/<category>/<timestamp>_<secure filename>``.
     For text datasets the payload is additionally rejected when
     ``nkf.guess`` classifies it as binary.

     Afterwards ``self.file_num`` is incremented and
     ``self.update_and_commit()`` is called.

     Args:
         uploaded_file: upload object exposing ``filename``, ``save(path)``
             and ``stream.read()``.
         category: sub-directory name below ``self.dataset_path``.

     Raises:
         ValueError: if the extension is not allowed for the dataset type,
             or a text upload is detected as binary.
     """
     filename = uploaded_file.filename
     ext = os.path.splitext(filename)[1].lower()
     if self.type == 'image':
         if ext not in ('.jpg', '.jpeg', '.png', '.gif'):
             raise ValueError('Invalid file type.')
     elif self.type == 'text':
         if ext not in ('.txt', ):
             raise ValueError('Invalid file type.')
     new_filename = os.path.join(
         self.dataset_path, category,
         ds_util.get_timestamp() + '_' + secure_filename(filename))
     if self.type == 'image':
         uploaded_file.save(new_filename)
     elif self.type == 'text':
         text = uploaded_file.stream.read()
         # The guess is normalised before comparing: the previous lower-case
         # literal 'binary' did not match an upper-case 'BINARY' result, so
         # binary payloads slipped through as text.
         if nkf.guess(text).upper() == 'BINARY':
             raise ValueError(
                 'Invalid file type. File must be a text file.')
         # Context manager closes the file even if the write fails.
         with open(new_filename, 'w') as f:
             f.write(text)
     self.file_num += 1
     self.update_and_commit()
示例#3
0
def get_text_sample(path, character_num=-1):
    """Return the decoded content of *path*, optionally truncated.

    The file is read in full, its encoding is guessed with ``nkf.guess`` and
    the content is decoded with that encoding.

    Args:
        path: path of the file to read.
        character_num: maximum number of characters to return. Any value
            below 0 (the default is -1) returns the whole text.

    Returns:
        The decoded text, cut to the first ``character_num`` characters when
        ``character_num`` is 0 or greater.
    """
    # Close the handle deterministically instead of relying on the GC.
    with open(path) as source:
        raw_text = source.read()
    text = raw_text.decode(nkf.guess(raw_text))
    if character_num > -1:
        return text[:character_num]
    return text
示例#4
0
def decode(text, encoding=None, *args):
    """Decode *text*, guessing the encoding when none is trustworthy.

    When *encoding* is missing or is ISO-8859-1, the encoding is guessed with
    ``nkf.guess``; a guess of ``'BINARY'`` or ``'ISO-8859-1'`` is replaced by
    ``'utf8'``. The name is then passed through ``normalize_encoding``.

    If the normalised name is not in ``all_encodings``, the text is converted
    by ``nkf.nkf('-w', ...)`` and decoded as UTF-8. Otherwise it is decoded
    directly, forwarding any extra positional arguments to ``text.decode``.
    """
    should_guess = (not encoding) or encoding in ('ISO-8859-1', 'iso-8859-1')
    if should_guess:
        guessed = nkf.guess(text)
        encoding = 'utf8' if guessed in ('BINARY', 'ISO-8859-1') else guessed
    encoding = normalize_encoding(encoding)
    if encoding in all_encodings:
        return text.decode(encoding, *args)
    # Unknown codec name: let nkf convert to UTF-8 for us.
    return nkf.nkf('-w', text).decode('utf8')
示例#5
0
文件: core.py 项目: kirin123kirin/cmd
 def getencoding(dat:bytes):
     """Return a codec name for *dat*, or ``None`` for binary data.

     Data containing a NUL byte is treated as binary without consulting
     ``nkf``. Otherwise the lower-cased ``nkf.guess`` result is returned,
     with ``"shift_jis"`` mapped to ``"cp932"`` and ``"binary"`` mapped to
     ``None``.
     """
     if b"\0" in dat:
         return None
     guessed = nkf.guess(dat).lower()
     if guessed == "binary":
         return None
     return "cp932" if guessed == "shift_jis" else guessed
#!/usr/bin/env python3
# encoding: utf-8

(print) # this is a python3 script. not python2.

import nkf # NKF wrapper for python3. See; http://sourceforge.jp/projects/nkf/scm/git/nkf/tree/master

try: #  BeautifulSoup,The best HTML parser python,is available to python3 after 2to3.
    from bs4 import BeautifulSoup as BSoup
except:
    from BeautifulSoup import BeautifulSoup as BSoup
import urllib.request
from sys import argv

# Default page fetched when no URL is given on the command line.
AURI="http://osu.ppy.sh/pages/include/profile-general.php?u=1679287"
# The first command-line argument, when present, overrides the default URL.
URI = argv[1] if len(argv) > 1 else AURI
# Close the HTTP response as soon as the body has been read.
with urllib.request.urlopen(URI) as response:
    bHtml = response.read()
# Guess the charset from the raw bytes and decode with it.
charset = nkf.guess(bHtml)
sHtml = bHtml.decode(charset)
# bs4's prettify() returns str when no encoding is passed, and str has no
# decode() on Python 3, so the result is printed directly.
print(BSoup(sHtml).prettify())