예제 #1
0
파일: parser.py 프로젝트: tyeth/email2slack
    def parse_header(parsed_mail, field):
        """Return header ``field`` of ``parsed_mail`` decoded to text.

        The raw value is unfolded and split into encoded-words
        (``=?charset?B|Q?...?=``) and plain text. Each encoded-word is decoded
        on its own because ``decode_header`` does not work well when an
        encoded-word is glued to other text, e.g.
        ``FW: =?ISO-2022-JP?B?GyRCR1s/LklURz0bKEI=?=:``.

        Args:
            parsed_mail: object exposing ``get(field, default)`` that returns
                the raw header string.
            field: name of the header to decode.

        Returns:
            The decoded header text; ``''`` when the header is absent.
        """
        raw_header = parsed_mail.get(field, '')
        chunks = re.split(r'(=\?[^?]+\?[BQ]\?[^?]+\?=)',
                          re.sub(r'\r?\n\s+', ' ', raw_header))

        # Drop the single space that separates two adjacent encoded-words so
        # their decoded text is joined without a gap.
        i = 0
        while i < len(chunks) - 2:
            if chunks[i].startswith('=?') and chunks[i].endswith('?=') and \
                    chunks[i + 1] == ' ' and \
                    chunks[i + 2].startswith('=?') and \
                    chunks[i + 2].endswith('?='):
                del chunks[i + 1]
            i += 1

        decoded = []
        for chunk in chunks:
            if '=?' not in chunk:
                if chunk:
                    decoded.append(chunk)
                continue
            for decoded_chunk, charset in decode_header(chunk):
                if not isinstance(decoded_chunk, bytes):
                    # decode_header hands the text back unchanged when the
                    # chunk holds no valid encoded-word (e.g. a bare '=?').
                    decoded.append(decoded_chunk)
                    continue
                if not charset:
                    decoded.append(decoded_chunk.decode())
                    continue
                # decode_header reports charset names in lower case, so
                # normalise before comparing with the upper-case names below.
                name = charset.upper()
                if name == 'ISO-2022-JP':
                    if callable(nkf):
                        decoded_chunk = nkf('-Jw', decoded_chunk)
                        charset = 'utf-8'
                    else:
                        # Rewrite the escape sequences so the bytes can be
                        # decoded with the ISO-2022-JP-2004 codec.
                        charset = 'ISO-2022-JP-2004'
                        decoded_chunk = decoded_chunk \
                            .replace(b'\033$B', b'\033$(Q') \
                            .replace(b'\033(J', b'\033(B')
                elif name in ('SJIS', 'SHIFT_JIS'):
                    if callable(nkf):
                        decoded_chunk = nkf('-Sw', decoded_chunk)
                        charset = 'utf-8'
                    else:
                        charset = 'CP932'
                elif name == 'EUC-JP':
                    if callable(nkf):
                        decoded_chunk = nkf('-Ew', decoded_chunk)
                        charset = 'utf-8'
                    else:
                        charset = 'EUCJIS2004'
                try:
                    decoded_chunk = decoded_chunk.decode(
                        charset, errors='replace')
                except TypeError:
                    # Kept from the original: presumably for interpreters
                    # whose decode() rejects keyword arguments; the chunk is
                    # then left undecoded.
                    pass
                except LookupError:
                    # Charset name unknown to Python: fall back to a lossy
                    # UTF-8 decode instead of failing on the whole header.
                    decoded_chunk = decoded_chunk.decode('utf-8', 'replace')
                decoded.append(decoded_chunk)
        return ''.join(decoded)
예제 #2
0
def file_read(file_path):
    """Read ``file_path`` and return its contents as text.

    The raw file data is passed through ``nkf.nkf`` with the options
    ``-w -d`` and the result is decoded as UTF-8.

    Args:
        file_path: path of the file to read.

    Returns:
        The converted file contents.
    """
    # Read raw bytes: the file's encoding is only settled once nkf has
    # converted it. The context manager closes the handle promptly.
    with open(file_path, "rb") as f:
        contents = f.read()
    return nkf.nkf("-w -d", contents).decode("utf_8")
예제 #3
0
def create_img(name, tenki, kuji, honbun):
  """Draw the slip, save it as ``output.png`` and return the image.

  The module-level ``g_pos_y`` is reset to 0 before drawing. Text is laid
  out with ``draw_text`` in this order: date, addressee line, the pieces of
  ``tenki``, a titled banner, the first two tokens of ``kuji`` and finally
  the lines of ``honbun``.

  Args:
    name: addressee name; ' 局長殿' is appended to it.
    tenki: text split on '、'; every piece is drawn centered.
    kuji: whitespace-separated text; tokens 0 and 1 are drawn centered.
    honbun: body text, run through nkf with '-f22' (presumably line
      folding -- confirm against the nkf docs) and drawn line by line.

  Returns:
    The image after conversion to grayscale and thresholding at 128.
  """
  global g_pos_y
  g_pos_y = 0

  today = datetime.datetime.today().strftime("%Y/%m/%d")
  img = PIL.Image.new('RGB', (384, 1000), (255,255,255))
  rule = '================================'

  def spacer():
    # One blank line between sections.
    draw_text(img, ' ')

  spacer()
  draw_text(img, today, align='center', size=28)
  spacer()
  draw_text(img, name + ' 局長殿', align='center', size=40)
  spacer()
  for segment in tenki.split('、'):
    draw_text(img, segment, align='center', size=32)
  spacer()
  draw_text(img, rule)
  draw_text(img, 'KDNおみくじ', align='center', size=48)
  draw_text(img, rule)
  spacer()
  kuji_parts = kuji.split()
  draw_text(img, kuji_parts[0], align='center', size=36)
  spacer()
  draw_text(img, kuji_parts[1], align='center', size=36)
  spacer()
  folded = nkf.nkf('-f22', honbun).decode('utf-8')
  for row in folded.splitlines():
    draw_text(img, row, size=32)

  def to_black_or_white(level):
    # Pixels darker than 128 become black, everything else white.
    return 0 if level < 128 else 255

  img = img.convert('L').point(to_black_or_white)
  img.save('output.png')
  return img
예제 #4
0
def decode(text, encoding=None, *args):
    """Decode ``text`` using ``encoding``, guessing it when not usable.

    When ``encoding`` is empty or ISO-8859-1, ``nkf.guess`` picks the
    encoding; a guess of 'BINARY' or 'ISO-8859-1' is treated as 'utf8'.
    The name is then passed through ``normalize_encoding``. If the result is
    not in ``all_encodings`` the text is converted to UTF-8 by nkf and
    decoded as such; otherwise ``text.decode`` is called with the normalized
    name and any extra positional ``args``.
    """
    needs_guess = not encoding or encoding in ('ISO-8859-1', 'iso-8859-1')
    if needs_guess:
        guessed = nkf.guess(text)
        encoding = 'utf8' if guessed in ('BINARY', 'ISO-8859-1') else guessed

    codec = normalize_encoding(encoding)
    if codec in all_encodings:
        return text.decode(codec, *args)
    # No matching codec: let nkf convert to UTF-8 instead.
    return nkf.nkf('-w', text).decode('utf8')
예제 #5
0
def process_institution():
    """Merge the institution catalog CSVs into ``refine/institution.csv``.

    Every file matching ``import/catalog/institution*.csv`` is converted to
    UTF-8 with nkf, passed through ``normalize`` and has two Unicode hyphen
    variants replaced by ``-``. Each file is parsed as comma-separated data
    and, if its first row does not split into several columns, as
    tab-separated data. The first row is the header; the remaining non-blank
    rows are collected with embedded line breaks replaced by spaces.

    Raises:
        AssertionError: if the files disagree on the header row, or if the
            first column of the collected rows contains duplicates.
    """

    def fetch_fields(reader):
        # Only the first row is inspected: it is the header when the dialect
        # splits it into more than one column, otherwise this reader is
        # rejected.
        first = next(reader, None)
        if first is not None and len(first) > 1:
            return first
        return None

    institution_rows = []
    institution_fields = None
    for f in glob.glob("import/catalog/institution*.csv"):
        # Close the input promptly instead of leaking the handle.
        with open(f, "rb") as src:
            raw = src.read()
        data = normalize(nkf.nkf("-w", raw).decode("UTF-8"))
        for hyphen in [b"\xe2\x80\x90", b"\xe2\x88\x92"]:
            data = data.replace(hyphen.decode("UTF-8"), "-")

        # The csv module wants bytes on Python 2 and text on Python 3.
        if sys.version_info.major < 3:
            data = data.encode("UTF-8")
            readers = [
                csv.reader(io.BytesIO(data)),
                csv.reader(io.BytesIO(data), dialect="excel-tab")
            ]
        else:
            readers = [
                csv.reader(io.StringIO(data)),
                csv.reader(io.StringIO(data), dialect="excel-tab")
            ]

        for rd in readers:
            fields = fetch_fields(rd)
            if fields:
                if institution_fields is None:
                    institution_fields = fields
                else:
                    assert institution_fields == fields

                for r in rd:
                    r = [
                        x.replace("\r\n", " ").replace("\n", " ")
                        .replace("\r", " ").strip()
                        for x in r
                    ]
                    if not is_blank_row(r):
                        institution_rows.append(r)
                break

    ids = [r[0] for r in institution_rows]
    assert len(ids) == len(set(ids))

    outname = "refine/institution.csv"
    if sys.version_info.major < 3:
        out_file = open(outname, "wb")
    else:
        out_file = codecs.open(outname, "wb", encoding="UTF-8")

    # The context manager flushes and closes the output even if writing fails.
    with out_file:
        out = csv.writer(out_file)
        out.writerow(institution_fields)
        out.writerows(institution_rows)
예제 #6
0
파일: parser.py 프로젝트: tyeth/email2slack
    def extract_message(message):
        """Extract the decoded body or bodies from ``message``.

        Returns:
            * for a multipart message: a flat list of the non-empty results
              extracted recursively from each part;
            * for a message with an empty payload: ``None``;
            * otherwise: a ``(Content-Type header, decoded text)`` tuple.

        The charset is taken from ``chardet.detect`` on the payload bytes
        ('utf-8' when detection yields nothing). For the three Japanese
        charsets handled here the body is converted to UTF-8 by ``nkf`` when
        it is callable, else decoded with a fallback Python codec.
        """
        if message.is_multipart():
            collected = []
            for part in message.get_payload():
                result = EmailParser.extract_message(part)
                if not result:
                    continue
                if isinstance(result, list):
                    collected.extend(result)
                else:
                    collected.append(result)
            return collected

        body = message.get_payload(decode=True)
        if not body:
            return None

        # detected charset -> (nkf options, fallback codec without nkf)
        japanese = {
            'ISO-2022-JP': ('-Jwx', 'ISO-2022-JP-2004'),
            'SHIFT_JIS': ('-Swx', 'CP932'),
            'EUC-JP': ('-Ew', 'EUCJIS2004'),
        }

        charset = chardet.detect(body)['encoding']
        if charset is None:
            charset = 'utf-8'
        elif charset in japanese:
            nkf_options, fallback_codec = japanese[charset]
            if callable(nkf):
                body = nkf(nkf_options, body)
                charset = 'utf-8'
            else:
                if charset == 'ISO-2022-JP':
                    # Rewrite the escape sequences for the 2004 codec.
                    body = body.replace(b'\033$B', b'\033$(Q')
                    body = body.replace(b'\033(J', b'\033(B')
                charset = fallback_codec

        text = body.decode(encoding=charset, errors='replace')
        return message['Content-Type'], text
예제 #7
0
def process_institution():
	"""Merge the institution catalog CSVs into ``refine/institution.csv``.

	Every file matching ``import/catalog/institution*.csv`` is converted to
	UTF-8 with nkf, passed through ``normalize`` and has two Unicode hyphen
	variants replaced by ``-``. Each file is parsed as comma-separated data
	and, if its first row does not split into several columns, as
	tab-separated data. The first row is the header; the remaining non-blank
	rows are collected with embedded line breaks replaced by spaces.

	Raises:
		AssertionError: if the files disagree on the header row, or if the
			first column of the collected rows contains duplicates.
	"""

	def fetch_fields(reader):
		# Only the first row is inspected: it is the header when the dialect
		# splits it into more than one column, otherwise this reader is
		# rejected.
		first = next(reader, None)
		if first is not None and len(first) > 1:
			return first
		return None

	institution_rows = []
	institution_fields = None
	for f in glob.glob("import/catalog/institution*.csv"):
		# Close the input promptly instead of leaking the handle.
		with open(f, "rb") as src:
			raw = src.read()
		data = normalize(nkf.nkf("-w", raw).decode("UTF-8"))
		for hyphen in [b"\xe2\x80\x90", b"\xe2\x88\x92"]:
			data = data.replace(hyphen.decode("UTF-8"), "-")

		# The csv module wants bytes on Python 2 and text on Python 3.
		if sys.version_info.major < 3:
			data = data.encode("UTF-8")
			readers = [csv.reader(io.BytesIO(data)), csv.reader(io.BytesIO(data), dialect="excel-tab")]
		else:
			readers = [csv.reader(io.StringIO(data)), csv.reader(io.StringIO(data), dialect="excel-tab")]

		for rd in readers:
			fields = fetch_fields(rd)
			if fields:
				if institution_fields is None:
					institution_fields = fields
				else:
					assert institution_fields == fields

				for r in rd:
					r = [x.replace("\r\n", " ").replace("\n"," ").replace("\r"," ").strip() for x in r]
					if not is_blank_row(r):
						institution_rows.append(r)
				break

	ids = [r[0] for r in institution_rows]
	assert len(ids) == len(set(ids))

	outname = "refine/institution.csv"
	if sys.version_info.major < 3:
		out_file = open(outname, "wb")
	else:
		out_file = codecs.open(outname, "wb", encoding="UTF-8")

	# The context manager flushes and closes the output even if writing fails.
	with out_file:
		out = csv.writer(out_file)
		out.writerow(institution_fields)
		out.writerows(institution_rows)
예제 #8
0
    def __init__(
        self, iterable, dialect='excel', error_mode="strict", encoding=None,
        headers=None, *args, **kwargs
    ):
        """Set up a CSV reader over ``iterable``.

        An ``io.StringIO`` input requires ``encoding`` (asserted) and is
        re-encoded with it; any other input is read and converted to UTF-8
        by nkf, with ``self.encoding`` set to 'utf8'.

        ``self.reader`` is a ``csv.reader`` when ``headers`` is truthy and a
        ``csv.DictReader`` otherwise; extra ``args``/``kwargs`` are forwarded
        to it. ``line_num`` starts at 1.
        """
        if isinstance(iterable, io.StringIO):
            assert encoding
            self.encoding = encoding
            payload = iterable.read().encode(encoding)
        else:
            # force utf8 with nkf module
            self.encoding = 'utf8'
            payload = nkf.nkf('-w', iterable.read())
        iterable = StringIO.StringIO(payload)

        self.headers = headers
        if headers:
            self.reader = csv.reader(
                iterable, dialect=dialect, *args, **kwargs)
        else:
            self.reader = csv.DictReader(
                iterable, dialect=dialect, *args, **kwargs)
        self.dialect = self.reader.dialect
        self.line_num = 1
        self.error_mode = error_mode
예제 #9
0
    def __init__(self,
                 iterable,
                 dialect='excel',
                 error_mode="strict",
                 encoding=None,
                 headers=None,
                 *args,
                 **kwargs):
        """Set up a CSV reader over ``iterable``.

        An ``io.StringIO`` input requires ``encoding`` (asserted) and is
        re-encoded with it; any other input is read and converted to UTF-8
        by nkf, with ``self.encoding`` set to 'utf8'.

        ``self.reader`` is a ``csv.reader`` when ``headers`` is truthy and a
        ``csv.DictReader`` otherwise; extra ``args``/``kwargs`` are forwarded
        to it. ``line_num`` starts at 1.
        """
        text_input = isinstance(iterable, io.StringIO)
        if text_input:
            assert encoding
            self.encoding = encoding
            raw = iterable.read().encode(encoding)
        else:
            # force utf8 with nkf module
            self.encoding = 'utf8'
            raw = nkf.nkf('-w', iterable.read())
        iterable = StringIO.StringIO(raw)

        self.headers = headers
        if headers:
            reader = csv.reader(iterable, dialect=dialect, *args, **kwargs)
        else:
            reader = csv.DictReader(iterable, dialect=dialect, *args, **kwargs)
        self.reader = reader
        self.dialect = reader.dialect
        self.line_num = 1
        self.error_mode = error_mode
예제 #10
0
    def get_pos_detail3(self):
        """Return the value stored in ``self.pos_detail3``."""
        return self.pos_detail3

# Prints the information held by a Word instance.
def print_word(word):
    """Print the results of ``word``'s accessor methods, one per line.

    The order is: ``get_surface``, ``get_pos``, ``get_pos_detail1``,
    ``get_pos_detail2``, ``get_pos_detail3``.
    """
    accessors = (
        "get_surface",
        "get_pos",
        "get_pos_detail1",
        "get_pos_detail2",
        "get_pos_detail3",
    )
    for accessor in accessors:
        print(getattr(word, accessor)())

# Wrap stdout in a UTF-8 writer so that Japanese text can be printed.
sys.stdout = codecs.getwriter("utf_8")(sys.stdout)

# Read the input file and run it through nkf with the options "-w -d".
contents = open("./hoge.txt").read()
contents = nkf.nkf("-w -d", contents)

# Run the morphological analysis.
# Note: the string handed to MeCab must always be encoded,
#       and the result must be decoded before use.
# Reference: http://shogo82148.github.io/blog/2012/12/15/mecab-python/
result = MeCab.Tagger("")\
        .parse(contents)\
        .decode("utf-8")

# Split the result into morphemes and collect them into word_arr.
# `pattern` matches one output line: a field before the tab, then nine
# comma-separated fields after it.
lines = result.split("\n")
pattern = r"^(.*?)\t(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?)$"
doc_arr = []
for line in lines:
    word_arr = []
예제 #11
0
def htmltotext(os_pathname):
    """Return the result of ``elinks`` applied to the HTML file's contents.

    The file is converted to UTF-8 by nkf before being handed to ``elinks``.

    Args:
        os_pathname: path of the HTML file.
    """
    # Read raw bytes (the encoding is unknown until nkf has converted it) and
    # close the handle promptly instead of leaking it.
    with open(os_pathname, "rb") as f:
        html = f.read()
    return elinks(nkf.nkf("-w", html))
예제 #12
0
def utf8_cleanup(text):
    """Return ``text`` as UTF-8.

    A ``str`` is converted by nkf with ``-w``; any other value is encoded
    via its own ``encode("utf-8")``.
    """
    if not isinstance(text, str):
        return text.encode("utf-8")
    return nkf.nkf("-w", text)
예제 #13
0
 def decode(self, input, errors='strict'):
     """Convert ``input`` with nkf and return ``(text, len(input))``.

     nkf is called with '-m0 -x -J -w'; its output is turned into a unicode
     object as UTF-8 with the 'replace' error handler. The ``errors``
     argument is accepted but not used.
     """
     utf8_bytes = nkf.nkf('-m0 -x -J -w', input)
     return unicode(utf8_bytes, 'utf8', 'replace'), len(input)
예제 #14
0
def do(filename):
    """Return ``elinks.extract_from_html`` applied to the file's contents.

    The file is converted to UTF-8 by nkf before extraction.

    Args:
        filename: path of the HTML file.
    """
    # Read raw bytes (the encoding is unknown until nkf has converted it) and
    # close the handle promptly instead of leaking it.
    with open(filename, "rb") as f:
        html = f.read()
    return elinks.extract_from_html(nkf.nkf("-w", html))
예제 #15
0
파일: extract.py 프로젝트: walbrix/oscar
def htmltotext(os_pathname):
    """Return the result of ``elinks`` applied to the HTML file's contents.

    The file is converted to UTF-8 by nkf before being handed to ``elinks``.

    Args:
        os_pathname: path of the HTML file.
    """
    # Read raw bytes (the encoding is unknown until nkf has converted it) and
    # close the handle promptly instead of leaking it.
    with open(os_pathname, "rb") as source:
        raw_html = source.read()
    utf8_html = nkf.nkf("-w", raw_html)
    return elinks(utf8_html)
예제 #16
0
 def encode(self, input, errors='strict'):
     """Encode ``input`` via nkf and return ``(converted, len(input))``.

     ``input`` is first encoded as UTF-8 (with 'replace') and then converted
     by nkf with '-m0 -x -W -j'. The ``errors`` argument is accepted but not
     used.
     """
     utf8_bytes = input.encode('utf8', 'replace')
     converted = nkf.nkf('-m0 -x -W -j', utf8_bytes)
     return converted, len(input)
예제 #17
0
파일: extract.py 프로젝트: walbrix/oscar
def pdftotext(os_pathname):
    """Run ``/usr/bin/pdftotext`` on the file and return nkf's UTF-8 output.

    The command's output is obtained through ``process_output`` ("-" is
    passed as the output argument) and then converted by nkf with ``-w``.
    """
    command = ["/usr/bin/pdftotext", os_pathname, "-"]
    extracted = process_output(command)
    return nkf.nkf("-w", extracted)
예제 #18
0
파일: extract.py 프로젝트: walbrix/oscar
def text(os_pathname):
    """Return the file's contents converted to UTF-8 by nkf.

    Files larger than 10 MiB are not read; the placeholder string
    ``"***TOO LARGE TEXT FILE***"`` is returned instead.

    Args:
        os_pathname: path of the text file.
    """
    if os.stat(os_pathname).st_size > 1024 * 1024 * 10:
        return "***TOO LARGE TEXT FILE***"
    # Read raw bytes (the encoding is unknown until nkf has converted it) and
    # close the handle promptly instead of leaking it.
    with open(os_pathname, "rb") as f:
        raw = f.read()
    return nkf.nkf("-w", raw)
예제 #19
0
def to_hiragana(s):
    """Return ``s`` converted by nkf with '-w --hiragana', decoded as UTF-8."""
    return nkf.nkf("-w --hiragana", s).decode("utf-8")
예제 #20
0

# Prints the information held by a Word instance.
def print_word(word):
    """Print the results of ``word``'s accessor methods, one per line.

    The order is: ``get_surface``, ``get_pos``, ``get_pos_detail1``,
    ``get_pos_detail2``, ``get_pos_detail3``.
    """
    for accessor in ("get_surface", "get_pos", "get_pos_detail1",
                     "get_pos_detail2", "get_pos_detail3"):
        value = getattr(word, accessor)()
        print(value)


# Wrap stdout in a UTF-8 writer so that Japanese text can be printed.
sys.stdout = codecs.getwriter("utf_8")(sys.stdout)

# Read the input file and run it through nkf with the options "-w -d".
contents = open("./appry.txt").read()
contents = nkf.nkf("-w -d", contents)

# Run the morphological analysis.
# Note: the string handed to MeCab must always be encoded,
#       and the result must be decoded before use.
# Reference: http://shogo82148.github.io/blog/2012/12/15/mecab-python/
result = MeCab.Tagger("")\
        .parse(contents)\
        .decode("utf-8")

# Turn each morpheme into a Word instance and build a list of them.
# `pattern` matches one output line: a field before the tab, then nine
# comma-separated fields after it.
lines = result.split("\n")
pattern = r"^(.*?)\t(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?)$"
word_arr = []
for line in lines:
    iterator = re.finditer(pattern, line)
예제 #21
0
def load_mai2009(filename):
    """Load ``filename`` and return its contents as text.

    The file data is converted to UTF-8 by nkf, stripped of trailing
    whitespace and decoded as UTF-8.

    Args:
        filename: path of the file to load.
    """
    # Read raw bytes (the encoding is unknown until nkf has converted it) and
    # close the handle promptly instead of leaking it.
    with open(filename, 'rb') as f:
        raw = f.read()
    return nkf.nkf('-w', raw).rstrip().decode('utf-8')
예제 #22
0
 def encode(self, input, errors='strict'):
     """Encode ``input`` via nkf and return ``(converted, len(input))``.

     ``input`` is first encoded as UTF-8 (with 'replace') and then converted
     by nkf with '-m0 -x -W -s'. The ``errors`` argument is accepted but not
     used.
     """
     utf8_bytes = input.encode('utf8', 'replace')
     converted = nkf.nkf('-m0 -x -W -s', utf8_bytes)
     return converted, len(input)
예제 #23
0
 def decode(self, input, errors='strict'):
     """Convert ``input`` with nkf and return ``(text, len(input))``.

     nkf is called with '-m0 -x -S -w'; its output is turned into a unicode
     object as UTF-8 with the 'replace' error handler. The ``errors``
     argument is accepted but not used.
     """
     utf8_bytes = nkf.nkf('-m0 -x -S -w', input)
     return unicode(utf8_bytes, 'utf8', 'replace'), len(input)