예제 #1
0
def convert_to_strings(wikipage):
    """Return a structured dictionary holding all information from a wikipage.

    Parameters
    ----------
    wikipage : object
        Wikipage-like object exposing ``content``, ``title`` and ``links``.

    Returns
    -------
    dict
        Keys: 'title', 'summary' (Traditional-Chinese lead section or None),
        'sections' (list of (sub_title, body) tuples or None), 'links'.
    """
    from hanziconv import HanziConv
    import wikitextparser as wtp

    # Parse once and reuse; the original re-parsed the content three times
    # and imported pprint without using it.
    try:
        parsed_sections = wtp.parse(wikipage.content).sections
    except Exception:
        parsed_sections = None

    summary = None
    if parsed_sections:
        try:
            summary = HanziConv.toTraditional(parsed_sections[0].pprint())
        except Exception:
            summary = None

    sections = None
    if parsed_sections:
        try:
            texts = [HanziConv.toTraditional(sec.pprint())
                     for sec in parsed_sections[1:]]
            sub_titles = [HanziConv.toTraditional(sec.title[1:-1])
                          for sec in parsed_sections[1:]]
            # Drop the first line (the heading) from each section body.
            section_content = [s[s.find('\n') + 1:] for s in texts]
            sections = list(zip(sub_titles, section_content))
        except Exception:
            sections = None

    try:
        links = wikipage.links
    except Exception:
        links = None

    return {'title': wikipage.title, 'summary': summary,
            'sections': sections, 'links': links}
예제 #2
0
def convert_encoding_to_utf_8(filename):
    """Detect a file's encoding, convert its content to Traditional Chinese,
    and rewrite it as UTF-8 (with BOM) when anything changed.

    Relies on module-level ``total_cnt``/``success_cnt`` counters and the
    ``backup`` helper; uses ``chardet`` for detection and ``HanziConv`` for
    the Simplified-to-Traditional conversion.
    """
    global total_cnt, success_cnt

    # Each flag records whether the corresponding transformation changed
    # anything: the file name, the encoding, or the content.
    name_changed = True
    encoding_changed = True
    content_changed = True

    # Close the handle deterministically (the original leaked it).
    with codecs.open(filename, 'rb') as fh:
        content = fh.read()
    source_encoding = chardet.detect(content)['encoding']
    total_cnt += 1

    filename_trans = HanziConv.toTraditional(filename)
    if filename_trans == filename:
        name_changed = False

    if source_encoding != 'utf-8' and source_encoding != 'UTF-8-SIG':
        content = content.decode(source_encoding, 'ignore')
    else:
        flag2_already_utf8 = True  # for readability of the branch below
        encoding_changed = False

    # NOTE(review): when the file is already UTF-8, ``content`` stays bytes
    # here — presumably HanziConv tolerates that; confirm.
    content_trans = HanziConv.toTraditional(content)
    if content_trans == content:
        content_changed = False

    if name_changed or encoding_changed or content_changed:
        backup(filename)
        os.rename(filename, filename_trans)

        # UTF-8-SIG writes a BOM; kept from the original behaviour.
        with open(filename_trans, 'w', encoding='UTF-8-SIG') as file:
            file.write(content_trans)
        success_cnt += 1
예제 #3
0
def convert_dir(root_dir):
    """Recursively convert folder names, file names, encodings and contents
    under ``root_dir`` according to the global ``function_list`` switches.

    function_list[0]: rename folders/files to Traditional Chinese
    function_list[1]: convert file encoding to UTF-8
    function_list[2]: convert file content to Traditional Chinese
    """
    global function_list

    # Check that the root path is valid.
    if not os.path.exists(root_dir):
        print("[error] dir:", root_dir, "does not exist")
        return

    print("work in", root_dir)

    # First pass: rename folders (kept as a separate walk, as in the
    # original, so the second walk sees the renamed paths).
    if function_list[0] == 1:
        for root, dirs, files in os.walk(root_dir):
            root_trans = HanziConv.toTraditional(root)
            if root_trans != root:
                os.rename(root, root_trans)

    # Second pass: per-file work.
    for root, dirs, files in os.walk(root_dir):
        for f in files:
            filename = os.path.join(root, f)

            # Default to the original name so later steps always have a
            # valid path (the original raised NameError for
            # ``filename_trans`` when renaming was disabled).
            filename_trans = filename
            if function_list[0] == 1:
                filename_trans = HanziConv.toTraditional(filename)
                if filename_trans != filename:
                    os.rename(filename, filename_trans)

            # Read the file once for both remaining steps.
            content = None
            if (function_list[1] == 1 or function_list[2] == 1) and \
                    any(suf in filename_trans for suf in suffix):
                with codecs.open(filename_trans, 'rb') as fh:
                    content = fh.read()
                backup(filename_trans)

            # Convert the file encoding to UTF-8.
            if function_list[1] == 1 and content is not None:
                try:
                    content = convert_encoding_to_utf_8(filename_trans, content)
                except Exception:
                    print("Fail Convert utf-8", filename)

            # Convert the file content to Traditional Chinese.
            if function_list[2] == 1 and content is not None:
                try:
                    toTraditional(filename_trans, content)
                except Exception:
                    print("Fail Convert", filename)
예제 #4
0
 def __init__(self, title, author, author_role, body, form='simplified'):
     """Store the text fields, normalising them to the requested script.

     *form* must be 'simplified' or 'traditional'; anything else raises
     ValueError.
     """
     converters = {
         'simplified': HanziConv.toSimplified,
         'traditional': HanziConv.toTraditional,
     }
     convert = converters.get(form)
     if convert is None:
         raise ValueError(f'Unrecongnized form: {form}')
     self.title = convert(title)
     self.author = convert(author)
     self.author_role = convert(author_role)
     self.body = convert(body)
예제 #5
0
def trad_and_simp(inputString):
    """Expand *inputString* so it contains both the traditional and the
    simplified form of every Chinese character present.

    The result is unordered: it is only guaranteed to contain every
    distinct character from the input plus each character's simplified
    and traditional counterpart where one exists.

    Parameters
    ----------
    inputString : str
        Text containing traditional and/or simplified Chinese characters.

    Returns
    -------
    str
        All distinct characters from the input and both conversions.
    """
    chars = set(inputString)
    chars |= set(HanziConv.toSimplified(inputString))
    chars |= set(HanziConv.toTraditional(inputString))
    return "".join(chars)
예제 #6
0
def run(
    app: str = typer.Option(default="Spotify", help="Application to track"),
    debug: bool = typer.Option(default=False,
                               is_flag=True,
                               help="To show debug messages or not"),
    traditional: bool = typer.Option(
        default=False,
        is_flag=True,
        help="Translate lyrics into Traditional Chinese if possible",
    ),
):  # pragma: no cover
    """CLI entry point: print the lyric line for the track currently
    playing in *app*.

    Looks up the playing media, searches for matching songs, and prints
    the first lyric line anchored at the current playback position,
    optionally converted to Traditional Chinese.
    """
    # Enable or disable package logging depending on --debug.
    {True: logger.enable, False: logger.disable}[debug]("touchbar_lyric")

    if not debug:
        logger.disable("touchbar_lyric")
        logger.disable("__main__")

    media_info = get_info(app)
    if media_info is None:
        # Nothing is playing (or the app is not running).
        return

    songs = universal_search(media_info.name, media_info.artists)

    # Print the first song that has a lyric line at the current position.
    for song in songs:
        if song.anchor(media_info.position):
            line: str = song.anchor(media_info.position)
            if traditional:
                line = HanziConv.toTraditional(line)
            print(line)
            break
예제 #7
0
def send_reuqest(user_id, req_text, api_key):
    """POST *req_text* (converted to Simplified Chinese) to the Tuling123
    chatbot API and return the reply.

    The returned dict always has a boolean 'success' key; on HTTP 200 it
    also carries the API JSON response with 'text' converted back to
    Traditional Chinese. Network errors are swallowed (success=False).
    """
    payload = {
        "key": api_key,
        "info": HanziConv.toSimplified(req_text),
        #"loc": ""
        "userid": user_id,
    }

    result = {"success": False}

    try:
        resp = requests.post(TULING_123_URL, data=payload, timeout=TIMEOUT)
    except requests.RequestException:
        return result

    if resp.status_code == 200:
        result["success"] = True
        result.update(resp.json())
        if 'text' in result:
            result['text'] = HanziConv.toTraditional(result['text'])
    else:
        print(resp.text)

    return result
예제 #8
0
def translate(translate_file_path):
    """Rewrite the file at *translate_file_path* in Traditional Chinese.

    The file is read as UTF-8, converted in memory, and written back in
    place. An empty file is left empty.
    """
    with open(file=translate_file_path, mode="r", encoding="utf-8") as src:
        text = src.read()
    with open(file=translate_file_path, mode="w", encoding="utf-8") as dst:
        if text:
            dst.write(HanziConv.toTraditional(text))
예제 #9
0
파일: util.py 프로젝트: CheHaoKang/US_Stock
    def get_Xueqiu_categories(self):
        """Scrape the Xueqiu US-stock category names/links and pass them to
        ``self.GICS_csvs``. Retries forever until a scrape succeeds.
        """
        from hanziconv import HanziConv
        from selenium import webdriver
        from webdriver_manager.chrome import ChromeDriverManager

        url = 'https://xueqiu.com/hq#exchange=US&industry=3_2&firstName=3&page=1'
        while 1:
            driver = None
            try:
                driver = webdriver.Chrome(ChromeDriverManager().install())
                driver.get(url)
                driver.implicitly_wait(10)

                soup = BeautifulSoup(driver.page_source, 'html.parser')
                categories = {}
                for ele in soup.find_all('i', {'class' : 'list-style'}):
                    if re.search("明星股", ele.parent.text):
                        for li in ele.parent.find_all('li'):
                            key = HanziConv.toTraditional(li.text).strip()
                            link = "https://xueqiu.com/hq{}".format(
                                li.select('a')[0]['href'].strip())
                            categories[key] = link
                break
            except Exception:
                # The original hit a NameError here when Chrome failed to
                # start before ``driver`` was assigned.
                traceback.print_exc()
            finally:
                # Always release the browser, whether the scrape worked.
                if driver is not None:
                    driver.quit()

        self.GICS_csvs(categories)
예제 #10
0
파일: util.py 프로젝트: CheHaoKang/US_Stock
    def get_stock_info(self, stock_name, use_proxy=True):
        """Fetch the profile description of *stock_name* from Xueqiu.

        Returns the description converted to Traditional Chinese, or ''
        after ``self.RETRY`` failed attempts.
        """
        from hanziconv import HanziConv

        headers = {
            'X-Requested-With': 'XMLHttpRequest',
            'Referer': 'http://xueqiu.com/p/ZH010389',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0',
            'Host': 'xueqiu.com',
            #'Connection':'keep-alive',
            #'Accept':'*/*',
            'cookie':'s=iabht2os.1dgjn9z; xq_a_token=02a16c8dd2d87980d1b3ddced673bd6a74288bde; xq_r_token=024b1e233fea42dd2e0a74832bde2c914ed30e79; __utma=1.2130135756.1433017807.1433017807.1433017807.1;'
            '__utmc=1; __utmz=1.1433017807.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); Hm_lvt_1db88642e346389874251b5a1eded6e3=1433017809; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1433017809'
        }

        # Compile once instead of on every retry iteration.
        reGetStockInfo = re.compile(r"profile-detail.*?\">(.*?)<", re.S | re.UNICODE)

        counter = 0
        while counter < self.RETRY:
            counter += 1

            try:
                proxies = {}
                if use_proxy:
                    proxies = self.get_proxy()
                    print("PROXY => {:}".format(proxies))
                res = requests.get("https://xueqiu.com/S/" + stock_name,
                                   headers=headers, proxies=proxies,
                                   timeout=self.REQUEST_TIMEOUT)
                # Return the first profile-detail match found.
                for stockInfo in reGetStockInfo.findall(res.text):
                    return HanziConv.toTraditional(stockInfo)
            except Exception:
                # Narrowed from a bare except (which also swallowed
                # KeyboardInterrupt/SystemExit).
                traceback.print_exc()
                time.sleep(1)

        return ''
예제 #11
0
def cut(string, using_stopwords=True, simplified_convert=True, log=False):
    """Tokenise *string* with jieba's search-mode segmentation.

    When *simplified_convert* is true the text is converted to Simplified
    Chinese before segmentation and tokens are converted back to
    Traditional afterwards. Characters listed in digit_mark.json are
    blanked first; stop words are removed unless *using_stopwords* is
    false. Set *log* to print removed stop words.
    """
    string = string.lower()
    if simplified_convert:
        string = HanziConv.toSimplified(string)
    with open(os.path.join(BASE_DIR, 'digit_mark.json'),
              encoding='utf-8') as data_file:
        for mark in json.load(data_file):
            string = string.replace(mark, ' ')
        tokens = list(jieba.cut_for_search(string))
        if simplified_convert:
            tokens = [HanziConv.toTraditional(tok) for tok in tokens]
        tokens = [tok for tok in tokens if tok.strip() != '']
    if using_stopwords:
        with open(os.path.join(BASE_DIR, 'stopwords.txt'),
                  encoding='utf-8') as data_file:
            stopwords = [entry.replace('\n', '')
                         for entry in data_file.readlines()]
            if log:
                removed = [tok for tok in list(tokens) if tok in stopwords]
                if removed:
                    print('token removed : ' + ", ".join(removed))
            tokens = [tok for tok in list(tokens) if tok not in stopwords]
    else:
        tokens = list(tokens)
    return tokens
예제 #12
0
def print_in_line_reverse(row, msg, style, lang):
    """Lay *msg* out vertically into *row* rows, right-to-left.

    Characters are placed column by column (index = column * row + row
    index), each row is reversed, and rows are joined with '<br>'. Cells
    past the end of the message are padded with '㍐'. lang 'S'/'T'
    converts the message to Simplified/Traditional Chinese first. An
    empty *msg* returns None.
    """
    if not msg:
        return None
    if lang == 'S':
        msg = HanziConv.toSimplified(msg)
    elif lang == 'T':
        msg = HanziConv.toTraditional(msg)

    msg = sub(msg)
    len_col = math.ceil(len(msg) / row)  # number of columns, rounded up

    big_line = ''
    for i in range(row):
        line = ''
        for j in range(len_col):
            try:
                line += msg[j * row + i] + style
            except IndexError:
                # Past the end of the message: pad the cell. The original
                # bare except also masked unrelated errors.
                line += '㍐' + style
        line = line[::-1]
        big_line += line + '<br>'
    return big_line
예제 #13
0
 def get_people_name(self):
     """Return the first person name found in the main content (converted
     to Traditional Chinese), or None when there is no content or no name.
     """
     # Fetch once instead of calling get_main_content() twice.
     # NOTE(review): assumes the getter is idempotent — confirm.
     content = self.get_main_content()
     if content is not None:
         term_list = segment.seg(HanziConv.toSimplified(content))
         for term in term_list:
             if str(term.nature) == NLP_Constant.people_name_pos:
                 return HanziConv.toTraditional(str(term.word))
     return None
 def simplified_to_traditional(self):
     """Convert wiki_text.txt from Simplified to Traditional Chinese,
     writing the result line by line to traditional.txt.
     """
     logging.info("等待中..(簡 to 繁)")
     # Context managers close both files even on error (the original
     # leaked the output handle when reading failed).
     with open("traditional.txt", "w", encoding="utf-8") as traditional:
         with open("wiki_text.txt", "r", encoding="utf-8") as simplified:
             for s in simplified:
                 traditional.write(HanziConv.toTraditional(s))
     print("成功簡體轉繁體!")
예제 #15
0
def toTraditional(filename, content):
    """Write *content* converted to Traditional Chinese back to *filename*
    (UTF-8 with BOM), but only when the conversion actually changed it.
    """
    converted = HanziConv.toTraditional(content)
    if converted == content:
        return
    # Rewrite with UTF-8 (BOM) encoding.
    with open(filename, 'w', encoding='UTF-8-SIG') as out:
        out.write(converted)
예제 #16
0
def pre_process(text):
    """Convert *text* to Traditional Chinese and return it as a
    space-separated string of jieba tokens.
    """
    converted = HanziConv.toTraditional(text)

    # load cantonese corpus
    # jb.load_userdict('util/dict/canto_dict.txt')
    tokens = list(jb.cut(converted))
    return " ".join(tokens)
예제 #17
0
def create_post():
    """Handle the new-post form: normalise the Chinese fields to
    Traditional characters, persist the Post, and redirect home.
    """
    form = PostForm()
    if not form.validate_on_submit():
        # Initial GET or validation failure: re-render the form.
        return render_template('create_post.html',
                               title='New Post',
                               form=form,
                               legend='New Post')

    chinese = HanziConv.toTraditional(form.chinese_content.data)
    new_post = Post(author=current_user,
                    title=HanziConv.toTraditional(form.title.data),
                    chinese_content=chinese,
                    content=form.content.data,
                    tags=form.tags.data)
    db.session.add(new_post)
    db.session.commit()
    flash('Your post has been created!', 'success')
    return redirect(url_for('home'))
예제 #18
0
 def process_text(self):
     """Convert wiki_text.txt line by line to Traditional Chinese and
     write the result to traditional.txt under word2vec_data.
     """
     logging.info("等待中..(簡 to 繁)")
     with open('./word2vec_data/traditional.txt', 'w',
               encoding='utf-8') as fw, \
          open('./word2vec_data/wiki_text.txt', 'r',
               encoding='utf-8') as f:
         for raw_line in f:
             fw.write(HanziConv.toTraditional(raw_line))
예제 #19
0
def inputTest():
    """Read a line from the user, fetch a chatbot reply from ``jerry``,
    and print the reply converted to Traditional Chinese (with type
    debugging output).
    """
    x = input("請說話:")
    # x: token
    response = jerry.get_response(x)
    y = HanziConv.toTraditional(response.text)

    print(type(x))
    print(type(y))
    print(y)
예제 #20
0
 def Transform_ZhTw_Save(self, File_Name, Next_FileName):
     """Convert every line of *File_Name* to Traditional Chinese and save
     the result to *Next_FileName* as UTF-8.

     NOTE(review): the source file is opened in binary mode, so HanziConv
     receives bytes lines — presumably it tolerates/decodes them; confirm.
     """
     converted_lines = []
     with open(File_Name, 'rb') as raw_file:
         for line in raw_file:
             converted_lines.append(HanziConv.toTraditional(line))
     with open(Next_FileName, 'wb') as out_file:
         # Encoding each line whole is equivalent to the original
         # character-by-character loop, without O(len) extra write calls.
         for line in converted_lines:
             out_file.write(line.encode('utf-8'))
예제 #21
0
def textrankJob(n):
    """Match keywords extracted by method *n* (e.g. 'tfidf' or textrank)
    against 7 test datasets and write the matches to CSV files.

    Each dataset file alternates lines: article title, then article
    content (tracked by ``flag``). For 'tfidf' a single keyword list is
    matched; otherwise three lists (one per experiment threshold 0.4,
    0.5, 0.6) are matched and written as separate rows.
    """
    # get keyword
    keyword = textrankGet(n)
    # read testdata line by line
    for i in range(1, 8):
        with open('./finalResult/' + n + 'dataset' + str(i) + '.csv',
                  'w',
                  newline='',
                  encoding='utf-8') as res:
            writer = csv.writer(res)
            with open('./testData/dataset' + str(i) + '.txt',
                      'r',
                      newline='',
                      encoding='utf-8') as txtfile:
                tr = txtfile.readlines()
                # flag=True means the current line is an article title;
                # it alternates with content lines.
                flag = True
                for t in tr:
                    if flag is True:
                        article = t
                    else:
                        # store keyword match on article content
                        keywordMatch = []
                        content = t
                        # start match keyword and content
                        for index in keyword:
                            temp = []
                            for k in index:
                                if n == 'tfidf':
                                    # tfidf keywords are Simplified; convert
                                    # so they match Traditional content.
                                    k = HanziConv.toTraditional(k)
                                if k in content:
                                    temp.append(k)
                            keywordMatch.append(temp)
                        # write match result to csv
                        writer.writerow([article.strip()])
                        writer.writerow([content.strip()])
                        if n == 'tfidf':
                            tempkeyword = []
                            string = "Result:"
                            tempkeyword.append(string)
                            for k in keywordMatch[0]:
                                tempkeyword.append(k)
                            writer.writerow(tempkeyword)
                            writer.writerow("\n")
                        else:
                            # One row per experiment threshold.
                            exp_value = [0.4, 0.5, 0.6]
                            for j in range(3):
                                tempkeyword = []
                                tempkeyword.append(exp_value[j])
                                for k in keywordMatch[j]:
                                    tempkeyword.append(k)
                                writer.writerow(tempkeyword)
                            writer.writerow("\n")

                    flag = not flag

    print("------------------------------------------")
예제 #22
0
 def preprocess(self, line, cond=None):
     """Convert *line* to Traditional Chinese, optionally keep only
     Chinese jieba tokens (cond == 'only_zh'), then collapse whitespace
     and lowercase the result.
     """
     line = HanziConv.toTraditional(line)
     if cond == 'only_zh':
         # Keep only tokens containing a Chinese character (per is_zh).
         words = [w for w in jieba.cut(line) if is_zh.search(w)]
         line = ' '.join(words)
     # Raw string for the regex (the original relied on "\s" passing
     # through unescaped, which raises a DeprecationWarning).
     line = re.sub(r"\s+", ' ', line).strip().lower()
     return line
예제 #23
0
def get_words(path):
    """Read *path* line by line, convert each stripped line to Traditional
    Chinese, and return the accumulated characters as a list.

    NOTE(review): ``words += word`` extends the list with the *characters*
    of each word, not the whole word — if whole words were intended this
    should be ``words.append(word)``; confirm against callers before
    changing.
    """
    words = []
    with codecs.open(path, 'r', 'utf8') as f:
        line = f.readline()
        while line:
            # strip() already removes the newline; the replace is redundant
            # but harmless.
            word = line.strip().replace('\n', '')
            word = HanziConv.toTraditional(word)
            words += word
            line = f.readline()
        return words
예제 #24
0
def to_traditional_chinese(content):
    """Return *content* converted to Traditional Chinese via HanziConv.

    If the optional ``hanziconv`` dependency is not installed, log a
    warning and return the content unchanged.
    """
    converted = content
    try:
        from hanziconv import HanziConv
        converted = HanziConv.toTraditional(content)
    except ImportError:
        # logging.warn() is deprecated; warning() is the supported API.
        logging.warning(
            'You need to install python module "HanziConv" to convert to traditional Chinese.'
        )
    return converted
예제 #25
0
    def checkUpdate(self):
        """Poll ``getLatestChapter`` and refresh the cached url/title (plus
        the Traditional-Chinese copy of the title) when the title changed.

        Returns True when an update was detected, else False.
        """
        # Fetch once; the original fetched a second time after detecting a
        # change, doubling the round-trips for the same data.
        latest_url, latest_title = self.getLatestChapter()

        if latest_title != self.latest_chapter_title:
            self.latest_chapter_url = latest_url
            self.latest_chapter_title = latest_title
            self.latest_chapter_title_cht = HanziConv.toTraditional(
                latest_title)
            return True
        else:
            return False
예제 #26
0
 def __init__(self, name, url) -> None:
     """Initialise a comic entry from its *name* and listing *url*.

     Derives the comic code from the URL, builds the anchor link, and
     fetches the latest chapter (plus a Traditional-Chinese title copy).
     """
     self.name = name
     self.url = url
     # e.g. ".../comic/<code>/" -> "<code>"
     self.code = url.rsplit("/")[-2]
     self.a_link = f"/comic/{self.code}/"
     self.chapter_count = 0
     self.latest_chapter_url, self.latest_chapter_title = self.getLatestChapter(
     )
     self.latest_chapter_title_cht = HanziConv.toTraditional(
         self.latest_chapter_title)
예제 #27
0
def chatBot_GET_Google(question):
    """Google for "<question> 維基百科", take the last path segment of the
    first cited URL as the keyword, and return it in Traditional Chinese.

    Prints a failure message and returns None when the request fails or
    no citation element is found.
    """
    url = 'https://www.google.com.tw/search?q=' + question + '+維基百科'
    response = requests.get(url)
    if response.status_code != 200:
        print('請求失敗')
        return None
    bs = BeautifulSoup(response.text, 'lxml')
    wiki_url = bs.find('cite')
    if wiki_url is None:
        # The original raised AttributeError when Google returned no
        # <cite> element.
        print('請求失敗')
        return None
    kwd = wiki_url.text.split('/')[-1]
    keyword_trad = HanziConv.toTraditional(kwd)
    return keyword_trad
예제 #28
0
    def concept_lookup(self):
        """Look up commonsense facts for ``self.conceptions``.

        Tries a Traditional-Chinese lookup first; if nothing is found,
        retries with an English translation. Stores the resulting set on
        ``self.commonsense``.
        """

        print('find only one conception,so get its commonsense at most 10')

        # Chinese (Traditional) lookup first.
        local_commonsense = Query.base_lookup(HanziConv.toTraditional(self.conceptions))

        if not local_commonsense:
            # Nothing found: translate to English and look up again.
            local_commonsense = Query.base_lookup(self.translator.zh_to_en(self.conceptions))
        self.commonsense = set(local_commonsense)
예제 #29
0
def subot_getGoogle(question):
    """Google for "<question> 維基百科" and return the last path segment of
    the first cited URL, converted to Traditional Chinese.

    Prints a failure message and returns None on a non-200 response or
    when no citation element is present.
    """
    url = f'https://www.google.com.tw/search?q={question}+維基百科'
    response = requests.get(url)
    if response.status_code != 200:
        print('解讀後轉換關鍵字失敗....')
        return None
    bs = BeautifulSoup(response.text, 'lxml')
    wiki_url = bs.find('cite')
    if wiki_url is None:
        # The original raised AttributeError when no <cite> was found.
        print('解讀後轉換關鍵字失敗....')
        return None
    kwd = wiki_url.text.split('/')[-1]
    keyword_trad = HanziConv.toTraditional(kwd)
    return keyword_trad
예제 #30
0
 def articles_parser_insert_mysql(self):  #74218
     """Parse a range of articles (title + content) with ``parsing.Parser``
     and insert the results into the articles_parser table.

     Rows with empty or unparsable titles are skipped. A title that
     parses to "error" is retried after conversion to Traditional
     Chinese. Content is split into clause-like lines on CJK punctuation;
     parsable lines are concatenated with '@' separators.
     """
     self.cursor.execute(
         "SELECT id, title, content FROM articles where id >= 198886 and id <= 200000"
     )
     sql = "INSERT INTO articles_parser (id, title_parser_result, content_parser_result) VALUES (%s, %s, %s)"
     results = self.cursor.fetchall()
     for record in results:
         index = record[0]
         title = record[1]
         content = record[2]
         print(index)
         print(title, end="\n\n")
         print(content)
         if title != "":
             # Strip punctuation/symbols before parsing the title.
             title_parser_result = parsing.Parser(
                 re.sub(
                     r"[\s+\.\【\】\‧\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+",
                     "", title))
             if len(title_parser_result) != 0:
                 if title_parser_result[0] == "error":
                     # Retry with the title converted to Traditional Chinese.
                     title = HanziConv.toTraditional(title)
                     title_parser_result = parsing.Parser(
                         re.sub(
                             r"[\s+\.\【\】\‧\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+",
                             "", title))
             else:
                 continue
         else:
             continue
         # Break content into lines on CJK punctuation.
         content = re.sub(r'\、|\,|\。|\?|\?|\;|\;|\:|\~|\:|\⋯|\!', '\n',
                          content)
         content_parser_result = ""
         for line in content.split("\n"):
             line = re.sub(
                 r"[\s+\.\【\】\‧\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+",
                 "", line)
             if len(line) >= 4 and '★' not in line and '◆' not in line:
                 print(line)
                 parser_result = parsing.Parser(line)
             else:
                 continue
             # The original used `is not 1` — an int identity comparison
             # (fragile, SyntaxWarning on 3.8+); != compares by value.
             if line == "" or len(
                     parser_result) != 1 or parser_result[0] == 'error':
                 continue
             content_parser_result += parser_result[0]
             content_parser_result += "@"
         time.sleep(self.sleep)
         val = (index, title_parser_result[0], content_parser_result)
         print(title_parser_result[0], end="\n\n")
         print(content_parser_result)
         self.cursor.execute(sql, val)
         self.db.commit()
     self.db.close()
def generate():
    """Generate a Traditional-Chinese caption for the currently opened
    image and display + speak it; prompts the user when no image is open.
    """
    # The original used `is 0` — an int identity comparison that only
    # works by accident of CPython's small-int cache; == is correct.
    if win.img_shown == 0:
        win.textBrowser.setText("請先開啟圖片")
        win.textBrowser.setFont(QtGui.QFont("Noto Sans Mono CJK TC", 17))
    else:
        win.textBrowser.setText('請稍等...')
        win.textBrowser.setFont(QtGui.QFont("Noto Sans Mono CJK TC", 17))
        predicted_cap = HanziConv.toTraditional(
            predict('./train_captions', "./ckpt-20", win.img_path))
        win.textBrowser.setText(predicted_cap)
        # win.textBrowser.setFont(win.def_font)
        speak(predicted_cap)
예제 #32
0
def hello():
    """Flask handler (Python 2 code: urllib2, str.encode round-trips).

    Converts the submitted name to Traditional Chinese, queries a remote
    transliteration service, and renders the page with the result.

    NOTE(review): the Kangxi generator request (url_kang) is fetched but
    its response is never used — presumably work in progress; confirm.
    """
    name = request.form['checking']
    temp_name = HanziConv.toTraditional(name)
    # name = HanziConv.toSimplified(name)
    name = name.encode('utf-8')
    name = urllib2.quote(name)
    url_tem= "http://csclab11.cs.nthu.edu.tw:5000/?q=%s"%name
    result = urllib2.urlopen(url_tem).read()
    #result = json.load(result)
    # print type(result)
    d = json.loads(result)
    kangxi=HanziConv.toTraditional(d["result"])
    # print d["result"]
    # namelist.append(temp_name)
    # resultlist.append(d["result"])
    # result = get_result(name)
    kangxi=kangxi.encode('utf-8')
    kangxi=urllib2.quote(kangxi)
    url_kang="http://kxgen.mqstudiotw.com/?%s"%kangxi
    kangxi_result = urllib2.urlopen(url_kang)
    #print kangxi_result
    return render_template('index.html', name=temp_name,result=d["result"])
예제 #33
0
def writeDBF(filePattern, fullFilePath, dicInput):
	"""Write *dicInput* (code -> name) to today's SH/SZ text file.

	Python 2 code (dict.iteritems, str/unicode mixing). filePattern '0'
	selects Shanghai ('SH'), '1' selects Shenzhen ('SZ').

	NOTE(review): fullFilePath is only used for the existence check and
	updateCount is never incremented — presumably leftovers from an older
	DBF-writing version; confirm before relying on the logged counts.
	"""
	global dbfFileHandle
	global dbfFileIndex
	global writeMax
	# dbfFileHandle = None
	# dbfFileIndex = None

	insertCount = 0; updateCount = 0;
	bFileExists = os.path.exists(fullFilePath) 

	dtWriteDBFStart = datetime.datetime.now()
	# logger.debug("write DBF start")
	today = dtWriteDBFStart.strftime("%Y%m%d")
	fileName = today
	strToken = ""
	if filePattern == "0":
		strToken = "SH"
		fileName += ".SH.txt"
	elif filePattern == "1":
		strToken = "SZ"
		fileName += ".SZ.txt"

	with open(fileName, "w") as text_file:
		for key, value in dicInput.iteritems():
			insertCount += 1

			value = HanziConv.toTraditional(value)
			try:
				# Best-effort decode to unicode; already-unicode values
				# raise and are kept as-is.
				value = value.decode("utf8")
			except:
				pass

			strWrite = (u"%s.%s,%s\n" % (key, strToken, value))
			text_file.write(strWrite.encode('utf8'))

	dtWriteDBFEnd = datetime.datetime.now()

	logger.debug("write count : " + str(insertCount) + "/" + str(updateCount))
	logger.debug("write DBF end (" + str(dtWriteDBFEnd - dtWriteDBFStart) + ")")
def get_json_from_page(page):
    """Extract place candidates for *page*.

    Tokenises the page's categories and summary (joined with '。',
    converted to Traditional Chinese, stop words removed) and looks the
    title up against the union of both token sets.
    """
    from hanziconv import HanziConv
    stopwords = load_stop_words()
    categories_text = HanziConv.toTraditional("。".join(page.categories))
    summary_text = HanziConv.toTraditional("。".join(page.summary))
    constraints = set(tokenize(categories_text, stopwords)) | set(
        tokenize(summary_text, stopwords))
    return get_places(page.title, constraints)
예제 #35
0
def gen_response(keyword_list):
    """Print the canned reply matching the first keyword (converted to
    Traditional Chinese). Raises KeyError for an unknown keyword.
    """
    dic = {"笑話":"你想要聽我說個笑話嗎", "無聊":"那聽個笑話好嗎"}
    first_keyword = HanziConv.toTraditional(keyword_list[0])
    print(dic[first_keyword])
예제 #36
0
    # but it's used this way
    "操你", "草你", "日你",  # f**k you
    "操他", "草他", "日他",  # f**k his
    "操她", "草她", "日她",  # f**k her

    # Discrimination (racial slurs)
    "小日本",  # little Japanese
    "台湾狗",  # Taiwanese dogs
    "共产中国",  # communist Chinese
    "流氓国家",  # rogue country
    "人渣",  # human slag
    "我去",  # this is verbal and bad
    "鬼子"  # devil, usually a suffix
]
# Expand the banned-word list to both scripts so matching works whether the
# input text is Simplified or Traditional Chinese.
BAD = [HanziConv.toSimplified(word) for word in bad_init] + \
      [HanziConv.toTraditional(word) for word in bad_init]

INFORMAL = [
    # Hello
    "你好",  # nǐ hǎo; The standard "hello" greeting.
    "您好",  # nín hǎo; The same "hello" greeting as above
    "你怎么样",  # nǐ zěnmeyàng?; "What's up?", "How are you doing?"

    # Good afternoon
    "午安",  # wǔ'an; note: seldom used in the Mainland.
    "下午好",  # xìawǔ hǎo! Seldom used in the Republic of China

    # Good evening / Good night
    "晚安",  # wǎn'an; Literally "Peace at night", Good night.
    "晚上好",  # wǎnshang hǎo; Good evening!
        m = re.search(ur"^(\[.+?\])(.+?):", s)
        if m:
            s = m.group(2) + m.group(1)
        else:
            m = re.search(ur"^\[.+?\](.*)", s)
            if m:
                s = m.group(1)
    return s

if __name__ == "__main__":
    import argparse

    # CLI: read an input file, convert it to Traditional or Simplified
    # Chinese, sort the lines with sort_func, and write the result.
    parser = argparse.ArgumentParser()

    parser.add_argument("input", action="store", nargs=1)
    parser.add_argument("output", action="store", nargs=1)
    parser.add_argument("--encoding", action="store", default="utf_8_sig", nargs=1)
    parser.add_argument("--traditional", action="store_true", default=False)
    args = parser.parse_args()

    buf = codecs.open(args.input[0], "rb", args.encoding).read()

    if args.traditional:
        buf = HanziConv.toTraditional(buf)
    else:
        buf = HanziConv.toSimplified(buf)

    lines = buf.split("\n")
    lines.sort(key=sort_func)
    # Re-join with newlines: split("\n") strips them and writelines() adds
    # no separators, so the original wrote everything on one line.
    codecs.open(args.output[0], "wb", args.encoding).write("\n".join(lines))
def get_sentences(page):
    """Split the page content (converted to Traditional Chinese) into
    sentences: break on line boundaries, then on the '。' full stop.
    """
    from hanziconv import HanziConv
    text = HanziConv.toTraditional(page.content)
    return [sentence
            for line in text.splitlines()
            for sentence in line.split('。')]
# Post-process `lines` (defined earlier in the file, outside this view):
# convert the value part of each "cmd value" entry to Traditional Chinese.
# Python 2 code (print statements, str.decode).
new_lines = []

n = 0
for line in lines:
    # Pass comment lines ('#'/'%') through untouched.
    if line[0] in "#%":
        new_lines.append(line)
        continue
    try:
        cmd, value = line.strip(' ').decode('utf-8').split(u' ', 1)
    except ValueError as e:
        # '\t' keyboard-mapping section: no space-separated value here.
        new_lines.append(line)
        continue

    # NOTE(review): lines whose conversion changed (newv != value) are only
    # counted, never appended to new_lines, and unchanged multi-char values
    # are printed but dropped — looks like a bug; confirm intent before use.
    newv = HanziConv.toTraditional(value)
    if newv != value:
        # print value ,
        # print ' -> ',
        # print newv
        n += 1
    elif len(value.strip()) > 1:
        print value.strip()
        pass
    else:
        newl = line.strip().split(' ')[0].decode('utf-8') + ' ' + newv
        new_lines.append(newl.encode('utf-8'))

print len(lines)
print n