Пример #1
0
    }],
    ["utf-8:count:null", "123Б測試", {
        "COUNT": 6
    }],
    ["utf-8:count#blah:null", "123Б測試", {
        "BLAH": 6
    }],
    ["utf-8:count#for=lala&for=cjk:null", "123Б測a試bc", {
        "COUNT": 2
    }],
]

# Overall outcome of the run; any failing case below clears it.
passed = True

# Each iotest entry unpacks into a conversion string, an input and the
# expected conversion result.
for conversion, source, expected in iotest:
    converter = Bsdconv(conversion)
    if converter:
        actual = converter.conv(source)
        if expected != actual:
            print("Test failed at %s" % repr([conversion, source, expected]))
            print("expected(%d): %s" % (len(expected), repr(expected)))
            print("result(%d): %s" % (len(actual), repr(actual)))
            passed = False
        del converter
        continue
    # The converter is falsy: report the library error and record the failure.
    print(Bsdconv.error())
    print("Test failed at %s" % repr([conversion, source, expected]))
    del converter
    passed = False

for c, d, i in countertest:
Пример #2
0
#!/usr/bin/env python

#mkbonus.py src_list char_list phrase_list

import sys
import re
from bsdconv import Bsdconv

# Output files, opened for writing. The usage comment at the top of the
# script names these arguments char_list (argv[2]) and phrase_list (argv[3]).
clist=open(sys.argv[2], "w")
plist=open(sys.argv[3], "w")

# Converters built from bsdconv conversion strings; bcv_zhtw is the same
# chain as bcv with an extra "zhtw" stage.
sc=Bsdconv("utf-8:score#with=cjk:null")
bcv=Bsdconv("utf-8:insert#after=002c:bsdconv-keyword,bsdconv")
bcv_zhtw=Bsdconv("utf-8:zhtw:insert#after=002c:bsdconv-keyword,bsdconv")

# Source lines are split into fields on runs of whitespace.
sep=re.compile(r"\s+")

# Source list (argv[1]), iterated line by line below.
# NOTE(review): none of the three files is closed in the visible code.
f=open(sys.argv[1])
for l in f:
	l = l.strip()
	if l == "":
		continue
	if l.startswith("#"):
		clist.write(l+"\n")
		plist.write(l+"\n")
	a = sep.split(l)
	p = a[0]
	ln = len(p.decode("utf-8"))
	if ln > 1:
		bonus = 6
		p = bcv_zhtw.conv(p).rstrip(",")
Пример #3
0
#!/usr/bin/env python
import os
import sys
from bsdconv import Bsdconv

# Bsdconv.mktemp returns a pair: element 1 is handed to os.unlink right away
# and element 0 is attached to the converter as its score store below
# (presumably path and open handle respectively -- confirm against the
# bsdconv binding documentation).
a = Bsdconv.mktemp("score.XXXXXX")
os.unlink(a[1])
clist = Bsdconv.fopen("characters_list.txt", "w+")

p = Bsdconv("utf-8:score-train:null")
if not p:
    # A falsy converter means construction failed; print the library error.
    print(Bsdconv.error())
    del p
    sys.exit()

p.ctl(Bsdconv.CTL_ATTACH_SCORE, a[0], 0)
p.ctl(Bsdconv.CTL_ATTACH_OUTPUT_FILE, clist, 0)

p.init()
# Feed the input to the converter in 1024-unit chunks. The context manager
# guarantees the file is closed even if a conversion call raises.
with open(sys.argv[1]) as f:
    s = f.read(1024)
    while s:
        p.conv_chunk(s)
        s = f.read(1024)

    # s is empty here; the final call finishes the chunked conversion.
    p.conv_chunk_last(s)
Пример #4
0
#!/usr/bin/env python
import sys
from bsdconv import Bsdconv

# Build a converter from the conversion string given as the first argument.
converter = Bsdconv(sys.argv[1])
if converter:
    # Convert the file named by argv[2] into the file named by argv[3], then
    # print the converter object and its counters.
    converter.conv_file(sys.argv[2], sys.argv[3])
    print(converter)
    print(converter.counter())
    del converter
else:
    # Construction failed: show the library error and stop.
    print(Bsdconv.error())
    del converter
    sys.exit()
Пример #5
0
# -*- coding: utf-8 -*-
# python nfkc_gen.py '⁰¹²³'|sort|uniq
import sys
from bsdconv import Bsdconv

# Converter that applies the "nfkc" stage between UTF-8 input and output.
normalizer = Bsdconv("utf-8:nfkc:utf-8")
text = sys.argv[1].decode("utf-8")
for char in text:
    raw = char.encode("utf-8")
    converted = normalizer.conv(raw)
    # Only report characters that the conversion actually changes.
    if raw != converted:
        print("{}\t{}".format(converted, raw))
Пример #6
0
class Crawler(object):
    '''
    Telnet crawler that logs in, opens a board and stores its articles.

    Bytes read from the connection are passed through ``convert`` and fed to
    a pyte terminal emulator; the emulated screen is what the methods read.
    Constructing an instance runs the whole crawl (see ``__init__``).
    '''

    # Converter shared by all instances; applied to raw connection data
    # before it is decoded as UTF-8 and fed to the terminal emulator.
    convert = Bsdconv("ansi-control,byte:big5-defrag:byte,ansi-control|skip,big5:utf-8,bsdconv_raw")

    def __init__(self, host):
        '''
        Connect to ``host`` on port 3456, log in, enter the 'NCTU-Teacher'
        board and fetch every article from a user-supplied id up to the
        latest id shown on the board.

        :param host: host name or address passed to ``Telnet``
        '''
        self.host = host
        self.delay = 0
        self.conn = Telnet(host, 3456)
        self.screen = pyte.Screen(80, 24)
        self.stream = pyte.Stream()
        self.screen.mode.discard(pyte.modes.LNM)
        self.stream.attach(self.screen)
        # Evaluated for its side effect: waits for the first data from the
        # server and feeds it to the screen.
        self.display
        self.login()
        self.enter_board('NCTU-Teacher')
        # int() makes the prompt result usable by range() whether input()
        # returns an evaluated number or a string.
        first_id = int(input('n - ' + self.last_id + ': '))
        for i in range(first_id, int(self.last_id) + 1):
            self.get_article(i)

    @property
    def display(self):
        '''
        Wait until the connection yields data, feed it to the terminal
        emulator and return the current screen contents (see
        ``screen_shot``).
        '''
        s = self.conn.read_very_eager()
        while not s:
            # Pause between polls; nothing to wait for once data has arrived.
            time.sleep(self.delay)
            s = self.conn.read_very_eager()
        s = self.convert.conv(s)
        self.stream.feed(s.decode('utf-8'))
        return self.screen_shot

    @property
    def screen_shot(self):
        '''Return the screen lines joined by newlines, encoded as UTF-8.'''
        return "\n".join(self.screen.display).encode("utf-8")

    def close(self):
        '''Close the underlying connection.'''
        self.conn.close()

    def send_enter(self, count=1):
        '''
        Send a carriage return ``count`` times.

        :returns: the screen contents after the key press when ``count`` is
                  exactly 1, otherwise ``None``
        '''
        s = None
        for _ in range(count):
            s = self.send('\r')
        return s if count == 1 else None

    def send(self, s):
        '''Write ``s`` to the connection and return the refreshed screen.'''
        self.conn.write(s)
        return self.display

    def login(self):
        '''Send the user name and the follow-up key presses of the login dialogue.'''
        username = '******'
        self.conn.write(username + '\r')
        self.conn.write('\rYY\r')
        self.send_enter(2)

    def enter_board(self, board):
        '''
        Save current board name in self.board
        and latest article_id in self.last_id

        :param board: name of the board to open
        :raises ValueError: if the line under the cursor does not start with
                            an article number
        '''
        # NOTE(review): 'OBOC' is sent literally; it may be a cursor-key
        # sequence whose escape bytes were lost -- confirm.
        self.send('OBOC')
        self.send('s{}'.format(board))
        self.send_enter(2)
        line = self.screen.cursor.y
        match = re.search(r'(?P<last_id>^\d+) ', self.screen.display[line].strip())
        if match is None:
            raise ValueError(
                'no article id found on the cursor line of board {!r}'.format(board))
        # Take the named group only; the full match also contains the
        # trailing space of the pattern.
        self.last_id = match.group('last_id')
        self.board = board

    def get_article(self, num=None):
        '''
        Open article ``num``, page through it until the status line shows
        '(100%)' and pass the collected screen lines to ``save_article``.

        Does nothing when ``num`` is falsy.
        '''
        if not num:
            return

        self.send('{}\rOC'.format(num))
        raw_article = self.screen.display[:-1]

        status_line = self.screen.display[-1]
        if '[Y/n]' in status_line:
            self.send('n')
        while '(100%)' not in status_line:
            self.send('OB')
            status_line = self.screen.display[-1]
            # After each step the line just above the status line is
            # appended to the article.
            raw_article.append(self.screen.display[-2])
        self.save_article(num, raw_article)

    def term_comm(self, feed=None, wait=None):
        '''
        Optionally write ``feed`` to the connection, read the response into
        the terminal emulator and return the screen contents.

        :param feed: data to write first, or ``None`` to write nothing
        :param wait: truthy -- after writing, block for some data first;
                     ``False`` -- skip the delayed eager read;
                     ``None`` (default) -- only do the delayed eager read
        '''
        if feed is not None:
            self.conn.write(feed)
            if wait:
                s = self.conn.read_some()
                s = self.convert.conv_chunk(s)
                self.stream.feed(s.decode("utf-8"))
        # Kept as an equality test so that only False (or a value equal to
        # it) disables the follow-up read, exactly as before.
        if wait != False:
            time.sleep(0.1)
            s = self.conn.read_very_eager()
            s = self.convert.conv_chunk(s)
            self.stream.feed(s.decode("utf-8"))
        return self.screen_shot

    def save_article(self, num, content):
        '''
        Parse the header lines of an article and insert or update its record.

        :param num: article id, stored as ``bbs_id``
        :param content: a list get from screen; lines 0-2 are read as the
                        author, title and time headers, the rest as the body
        '''
        chinese_keyword = {
            'board': '看板',
        }

        author_line = content[0].encode('utf-8').split()
        # Without the board keyword on the first line nothing is saved.
        if chinese_keyword['board'] not in author_line:
            return
        _i = author_line.index(chinese_keyword['board'])
        author = ' '.join(author_line[1:_i])

        title_line = content[1].encode('utf-8').split()[1:]
        title = ' '.join(title_line)

        time_line = content[2].encode('utf-8').split()[1:]
        # Named pub_time so the local does not shadow the time module.
        pub_time = ' '.join(time_line)
        if '(' in pub_time:
            # Keep only the text between the first '(' and the first ')'.
            pub_time = pub_time[pub_time.find('(') + 1:pub_time.find(')')]
        time_parts = pub_time.split()
        # Drops the second token; raises IndexError if there are fewer than two.
        time_parts.pop(1)
        pub_time = ' '.join(time_parts)
        print(pub_time)

        article = '\n'.join(content[3:]).encode('utf-8')

        try:
            post = Teacher.get(bbs_id=num)
        except Teacher.DoesNotExist:
            Teacher.create(author=author,
                title=title,
                pub_time=pub_time,
                content=article,
                bbs_id=num
            )
            logger.info('Insert: {id}'.format(id=num))
        else:
            # Existing record: only the body is refreshed.
            post.content = article
            post.save()
            logger.info('Update: {id}'.format(id=num))